From dd06f86b98527b6b6425ea679bea5cc347f5afb4 Mon Sep 17 00:00:00 2001 From: Vignesh Venkatasubramanian Date: Fri, 2 Jun 2023 03:02:16 +0000 Subject: libyuv: Update to r1871 (2a6cb743) Changes from upstream: https://chromium.googlesource.com/libyuv/libyuv/+log/d53f1bee..2a6cb743 The intention of the CL is to import the functions necessary to enable AV1 (and AVIF) 12-bit color conversion. Bug: 268505204 Test: Builds. Media and Camera CTS tests pass. (cherry picked from https://googleplex-android-review.googlesource.com/q/commit:27750a13c6eaacb9f716da3fe1734a8d106d7ff4) Merged-In: I756d3bd5047d4719659f9e1a449217b1940e51a4 Change-Id: I756d3bd5047d4719659f9e1a449217b1940e51a4 --- METADATA | 4 +- README.version | 2 +- files/.gn | 4 +- files/.vpython3 | 4 +- files/Android.bp | 1 + files/BUILD.gn | 22 +- files/CMakeLists.txt | 22 +- files/DEPS | 1091 ++-- files/README.chromium | 2 +- files/README.md | 1 + files/build_overrides/build.gni | 3 + files/build_overrides/partition_alloc.gni | 17 + files/docs/environment_variables.md | 3 + files/docs/getting_started.md | 29 + files/include/libyuv/convert.h | 123 + files/include/libyuv/convert_argb.h | 126 + files/include/libyuv/convert_from_argb.h | 51 +- files/include/libyuv/cpu_id.h | 7 + files/include/libyuv/planar_functions.h | 56 +- files/include/libyuv/rotate.h | 64 + files/include/libyuv/rotate_row.h | 45 + files/include/libyuv/row.h | 1077 +++- files/include/libyuv/scale_row.h | 43 + files/include/libyuv/version.h | 2 +- files/infra/config/PRESUBMIT.py | 2 + files/infra/config/cr-buildbucket.cfg | 252 +- files/infra/config/main.star | 20 +- files/infra/config/project.cfg | 2 +- files/infra/config/realms.cfg | 4 + files/libyuv.gni | 3 +- files/riscv_script/prepare_toolchain_qemu.sh | 74 + files/riscv_script/riscv-clang.cmake | 52 + files/riscv_script/run_qemu.sh | 15 + files/source/compare.cc | 6 +- files/source/compare_gcc.cc | 2 +- files/source/compare_mmi.cc | 123 - files/source/convert.cc | 893 ++- 
files/source/convert_argb.cc | 1426 ++++- files/source/convert_from.cc | 24 + files/source/convert_from_argb.cc | 1061 +++- files/source/cpu_id.cc | 100 +- files/source/mjpeg_decoder.cc | 4 +- files/source/planar_functions.cc | 659 ++- files/source/rotate.cc | 394 +- files/source/rotate_argb.cc | 16 +- files/source/rotate_common.cc | 127 +- files/source/rotate_gcc.cc | 130 + files/source/rotate_mmi.cc | 291 - files/source/rotate_neon.cc | 40 + files/source/rotate_neon64.cc | 71 +- files/source/row_any.cc | 852 ++- files/source/row_common.cc | 826 ++- files/source/row_gcc.cc | 578 +- files/source/row_lasx.cc | 370 +- files/source/row_lsx.cc | 1580 ++++- files/source/row_mmi.cc | 7842 ------------------------- files/source/row_neon.cc | 268 +- files/source/row_neon64.cc | 255 +- files/source/row_rvv.cc | 956 +++ files/source/row_win.cc | 65 +- files/source/scale.cc | 106 +- files/source/scale_any.cc | 16 + files/source/scale_argb.cc | 98 +- files/source/scale_common.cc | 191 +- files/source/scale_gcc.cc | 5 +- files/source/scale_mmi.cc | 1168 ---- files/source/scale_neon.cc | 39 + files/source/scale_neon64.cc | 39 + files/source/scale_uv.cc | 142 +- files/tools_libyuv/autoroller/roll_deps.py | 582 +- files/unit_test/convert_test.cc | 762 ++- files/unit_test/cpu_test.cc | 146 +- files/unit_test/planar_test.cc | 97 +- files/unit_test/rotate_argb_test.cc | 106 + files/unit_test/rotate_test.cc | 363 ++ files/unit_test/scale_uv_test.cc | 79 +- files/unit_test/testdata/riscv64.txt | 4 + files/unit_test/testdata/riscv64_rvv.txt | 4 + files/unit_test/testdata/riscv64_rvv_zvfh.txt | 4 + files/unit_test/unit_test.cc | 5 + files/unit_test/unit_test.h | 15 +- files/util/cpuid.c | 60 +- files/util/yuvconstants.c | 11 +- files/util/yuvconvert.cc | 10 +- 84 files changed, 13621 insertions(+), 12613 deletions(-) create mode 100644 files/build_overrides/partition_alloc.gni create mode 100755 files/riscv_script/prepare_toolchain_qemu.sh create mode 100644 
files/riscv_script/riscv-clang.cmake create mode 100755 files/riscv_script/run_qemu.sh delete mode 100644 files/source/compare_mmi.cc delete mode 100644 files/source/rotate_mmi.cc delete mode 100644 files/source/row_mmi.cc create mode 100644 files/source/row_rvv.cc delete mode 100644 files/source/scale_mmi.cc create mode 100644 files/unit_test/testdata/riscv64.txt create mode 100644 files/unit_test/testdata/riscv64_rvv.txt create mode 100644 files/unit_test/testdata/riscv64_rvv_zvfh.txt diff --git a/METADATA b/METADATA index bff062d8..5508de20 100644 --- a/METADATA +++ b/METADATA @@ -8,7 +8,7 @@ third_party { type: GIT value: "https://chromium.googlesource.com/libyuv/libyuv/" } - version: "d53f1beecdd8d959f7a3f2e19bd0bd7e7227a233" - last_upgrade_date { year: 2022 month: 8 day: 5 } + version: "2a6cb7431939faba1b40d3f08883847f0cf63572" + last_upgrade_date { year: 2023 month: 6 day: 1 } license_type: NOTICE } diff --git a/README.version b/README.version index 5deb188e..6eb9dc8c 100644 --- a/README.version +++ b/README.version @@ -1,4 +1,4 @@ -Version: r1837 +Version: r1871 BugComponent: 42195 Owner: lajos Local Modifications: diff --git a/files/.gn b/files/.gn index a765caa5..f9a5ee6c 100644 --- a/files/.gn +++ b/files/.gn @@ -34,7 +34,5 @@ exec_script_whitelist = build_dotfile_settings.exec_script_whitelist + default_args = { mac_sdk_min = "10.12" - - # https://bugs.chromium.org/p/libyuv/issues/detail?id=826 - ios_deployment_target = "10.0" + ios_deployment_target = "12.0" } diff --git a/files/.vpython3 b/files/.vpython3 index 0a9aa38b..28d819e7 100644 --- a/files/.vpython3 +++ b/files/.vpython3 @@ -76,8 +76,8 @@ wheel: < version: "version:5.8.0.chromium.2" > wheel: < - name: "infra/python/wheels/requests-py2_py3" - version: "version:2.26.0" + name: "infra/python/wheels/requests-py3" + version: "version:2.31.0" > # Used by various python unit tests. 
diff --git a/files/Android.bp b/files/Android.bp index 0c46f7f1..d02b56f3 100644 --- a/files/Android.bp +++ b/files/Android.bp @@ -62,6 +62,7 @@ cc_library { "source/row_msa.cc", "source/row_neon.cc", "source/row_neon64.cc", + "source/row_rvv.cc", "source/scale.cc", "source/scale_any.cc", "source/scale_argb.cc", diff --git a/files/BUILD.gn b/files/BUILD.gn index a72ff065..adaae9d8 100644 --- a/files/BUILD.gn +++ b/files/BUILD.gn @@ -6,6 +6,7 @@ # in the file PATENTS. All contributing project authors may # be found in the AUTHORS file in the root of the source tree. +import("//build/config/features.gni") import("//testing/test.gni") import("libyuv.gni") @@ -21,15 +22,19 @@ declare_args() { config("libyuv_config") { include_dirs = [ "include" ] - if (is_android && current_cpu == "arm64") { - ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker64" ] - } - if (is_android && current_cpu != "arm64") { - ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker" ] + if (is_android) { + if (target_cpu == "arm" || target_cpu == "x86" || target_cpu == "mipsel") { + ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker" ] + } else { + ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker64" ] + } } - + defines = [] if (!libyuv_use_neon) { - defines = [ "LIBYUV_DISABLE_NEON" ] + defines += [ "LIBYUV_DISABLE_NEON" ] + } + if (libyuv_disable_rvv) { + defines += [ "LIBYUV_DISABLE_RVV" ] } } @@ -129,6 +134,7 @@ static_library("libyuv_internal") { "source/row_any.cc", "source/row_common.cc", "source/row_gcc.cc", + "source/row_rvv.cc", "source/row_win.cc", "source/scale.cc", "source/scale_any.cc", @@ -150,7 +156,7 @@ static_library("libyuv_internal") { configs += [ "//build/config/gcc:symbol_visibility_default" ] } - if (!is_ios && !libyuv_disable_jpeg) { + if ((!is_ios || use_blink) && !libyuv_disable_jpeg) { defines += [ "HAVE_JPEG" ] # Needed to pull in libjpeg headers. 
Can't add //third_party:jpeg to deps diff --git a/files/CMakeLists.txt b/files/CMakeLists.txt index d190507b..7a4a1994 100644 --- a/files/CMakeLists.txt +++ b/files/CMakeLists.txt @@ -4,7 +4,7 @@ PROJECT ( YUV C CXX ) # "C" is required even for C++ projects CMAKE_MINIMUM_REQUIRED( VERSION 2.8.12 ) -OPTION( TEST "Built unit tests" OFF ) +OPTION( UNIT_TEST "Built unit tests" OFF ) SET ( ly_base_dir ${PROJECT_SOURCE_DIR} ) SET ( ly_src_dir ${ly_base_dir}/source ) @@ -41,18 +41,24 @@ endif() ADD_EXECUTABLE ( yuvconvert ${ly_base_dir}/util/yuvconvert.cc ) TARGET_LINK_LIBRARIES ( yuvconvert ${ly_lib_static} ) +# this creates the yuvconstants tool +ADD_EXECUTABLE ( yuvconstants ${ly_base_dir}/util/yuvconstants.c ) +TARGET_LINK_LIBRARIES ( yuvconstants ${ly_lib_static} ) -INCLUDE ( FindJPEG ) +find_package ( JPEG ) if (JPEG_FOUND) include_directories( ${JPEG_INCLUDE_DIR} ) - target_link_libraries( yuvconvert ${JPEG_LIBRARY} ) + target_link_libraries( ${ly_lib_shared} ${JPEG_LIBRARY} ) add_definitions( -DHAVE_JPEG ) endif() -if(TEST) +if(UNIT_TEST) find_library(GTEST_LIBRARY gtest) if(GTEST_LIBRARY STREQUAL "GTEST_LIBRARY-NOTFOUND") set(GTEST_SRC_DIR /usr/src/gtest CACHE STRING "Location of gtest sources") + if (CMAKE_CROSSCOMPILING) + set(GTEST_SRC_DIR third_party/googletest/src/googletest) + endif() if(EXISTS ${GTEST_SRC_DIR}/src/gtest-all.cc) message(STATUS "building gtest from sources in ${GTEST_SRC_DIR}") set(gtest_sources ${GTEST_SRC_DIR}/src/gtest-all.cc) @@ -61,7 +67,7 @@ if(TEST) include_directories(${GTEST_SRC_DIR}/include) set(GTEST_LIBRARY gtest) else() - message(FATAL_ERROR "TEST is set but unable to find gtest library") + message(FATAL_ERROR "UNIT_TEST is set but unable to find gtest library") endif() endif() @@ -78,6 +84,12 @@ if(TEST) if(NACL AND NACL_LIBC STREQUAL "newlib") target_link_libraries(libyuv_unittest glibc-compat) endif() + + find_library(GFLAGS_LIBRARY gflags) + if(NOT GFLAGS_LIBRARY STREQUAL "GFLAGS_LIBRARY-NOTFOUND") + 
target_link_libraries(libyuv_unittest gflags) + add_definitions(-DLIBYUV_USE_GFLAGS) + endif() endif() diff --git a/files/DEPS b/files/DEPS index 3cf2dbe0..a7bec8d3 100644 --- a/files/DEPS +++ b/files/DEPS @@ -5,43 +5,62 @@ gclient_gn_args = [ vars = { 'chromium_git': 'https://chromium.googlesource.com', - 'chromium_revision': '829c6df33dce1085a61d8fd44209fc84bbf9a6a7', - 'gn_version': 'git_revision:6f13aaac55a977e1948910942675c69f2b4f7a94', + 'chromium_revision': 'd1501576384de23ddf8d8815ee7c95be2f708de5', + 'gn_version': 'git_revision:e3978de3e8dafb50a2b11efa784e08699a43faf8', + # ninja CIPD package version. + # https://chrome-infra-packages.appspot.com/p/infra/3pp/tools/ninja + 'ninja_version': 'version:2@1.11.1.chromium.6', + # reclient CIPD package version + 'reclient_version': 're_client_version:0.107.1.0b39c4c-gomaip', # Keep the Chromium default of generating location tags. 'generate_location_tags': True, + + # By default, download the fuchsia sdk from the public sdk directory. + 'fuchsia_sdk_cipd_prefix': 'fuchsia/sdk/gn/', + 'fuchsia_version': 'version:12.20230530.1.1', + # By default, download the fuchsia images from the fuchsia GCS bucket. + 'fuchsia_images_bucket': 'fuchsia', + 'checkout_fuchsia': False, + # Since the images are hundreds of MB, default to only downloading the image + # most commonly useful for developers. Bots and developers that need to use + # other images can override this with additional images. 
+ 'checkout_fuchsia_boot_images': "terminal.qemu-x64", + 'checkout_fuchsia_product_bundles': '"{checkout_fuchsia_boot_images}" != ""', } deps = { 'src/build': - Var('chromium_git') + '/chromium/src/build' + '@' + 'dcea3443035f48d58193788e0bc56daca4e5db33', + Var('chromium_git') + '/chromium/src/build' + '@' + 'd0c2b4cf4fdd43866e066fb6722099aa8bf4ce79', 'src/buildtools': - Var('chromium_git') + '/chromium/src/buildtools' + '@' + '075dd7e22837a69189003e4fa84499acf63188cf', + Var('chromium_git') + '/chromium/src/buildtools' + '@' + 'edbefcee3d2cc45cdb0c60c2b01b673f8ba728bc', 'src/testing': - Var('chromium_git') + '/chromium/src/testing' + '@' + 'f4e42be13265ec304b0f3085eee2b15f30f44077', + Var('chromium_git') + '/chromium/src/testing' + '@' + 'a13817e1ea0255a375d13aeb3bb2527bd528495b', 'src/third_party': - Var('chromium_git') + '/chromium/src/third_party' + '@' + '42c249feeb71bc0cd184849f0509aefef599343d', + Var('chromium_git') + '/chromium/src/third_party' + '@' + '824e26c9fcbd00fccf6cdb712f8f127aae133042', 'src/buildtools/linux64': { 'packages': [ { - 'package': 'gn/gn/linux-amd64', + 'package': 'gn/gn/linux-${{arch}}', 'version': Var('gn_version'), } ], 'dep_type': 'cipd', - 'condition': 'checkout_linux', + 'condition': 'host_os == "linux"', }, + 'src/buildtools/mac': { 'packages': [ { - 'package': 'gn/gn/mac-amd64', + 'package': 'gn/gn/mac-${{arch}}', 'version': Var('gn_version'), } ], 'dep_type': 'cipd', - 'condition': 'checkout_mac', + 'condition': 'host_os == "mac"', }, + 'src/buildtools/win': { 'packages': [ { @@ -50,43 +69,57 @@ deps = { } ], 'dep_type': 'cipd', - 'condition': 'checkout_win', + 'condition': 'host_os == "win"', + }, + + 'src/buildtools/reclient': { + 'packages': [ + { + 'package': 'infra/rbe/client/${{platform}}', + 'version': Var('reclient_version'), + } + ], + 'dep_type': 'cipd', }, 'src/buildtools/clang_format/script': - Var('chromium_git') + '/external/github.com/llvm/llvm-project/clang/tools/clang-format.git' + '@' + 
'99876cacf78329e5f99c244dbe42ccd1654517a0', + Var('chromium_git') + '/external/github.com/llvm/llvm-project/clang/tools/clang-format.git' + '@' + 'f97059df7f8b205064625cdb5f97b56668a125ef', 'src/buildtools/third_party/libc++/trunk': - Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxx.git' + '@' + '79a2e924d96e2fc1e4b937c42efd08898fa472d7', + Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxx.git' + '@' + 'f8279b01085b800724f5c5629dc365b9f040dc53', 'src/buildtools/third_party/libc++abi/trunk': - Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxxabi.git' + '@' + '665b74f7d1b3bb295cd6ba7d8fcec1acd3d2ac84', + Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxxabi.git' + '@' + '5c8dbff7a4911fe1e0af0bc1628891e4187a3c90', 'src/buildtools/third_party/libunwind/trunk': - Var('chromium_git') + '/external/github.com/llvm/llvm-project/libunwind.git' + '@' + 'f51a154281bdfe746c46c07cd4fb05be97f9441d', + Var('chromium_git') + '/external/github.com/llvm/llvm-project/libunwind.git' + '@' + 'cd144ced35285edaa064a91561969e5b22c219b1', 'src/third_party/catapult': - Var('chromium_git') + '/catapult.git' + '@' + '75423c310eb303d28978be892fcf7b9c2c824909', + Var('chromium_git') + '/catapult.git' + '@' + '9f3ef9c2eae9b1adabde88efe5dcc438ba76e205', 'src/third_party/colorama/src': - Var('chromium_git') + '/external/colorama.git' + '@' + '799604a1041e9b3bc5d2789ecbd7e8db2e18e6b8', + Var('chromium_git') + '/external/colorama.git' + '@' + '3de9f013df4b470069d03d250224062e8cf15c49', + 'src/third_party/cpu_features/src': { + 'url': Var('chromium_git') + '/external/github.com/google/cpu_features.git' + '@' + '936b9ab5515dead115606559502e3864958f7f6e', + 'condition': 'checkout_android', + }, 'src/third_party/depot_tools': - Var('chromium_git') + '/chromium/tools/depot_tools.git' + '@' + '2ffa1bde797a8127c0f72908d0bd74051fd65d0d', + Var('chromium_git') + '/chromium/tools/depot_tools.git' + '@' + 
'05ab73be51774f098eb580eda6e96a49e1010b1b', 'src/third_party/freetype/src': - Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + 'cff026d41599945498044d2f4dcc0e610ffb6929', + Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + '80a507a6b8e3d2906ad2c8ba69329bd2fb2a85ef', 'src/third_party/googletest/src': - Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + 'e2f3978937c0244508135f126e2617a7734a68be', + Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + 'af29db7ec28d6df1c7f0f745186884091e602e07', 'src/third_party/harfbuzz-ng/src': - Var('chromium_git') + '/external/github.com/harfbuzz/harfbuzz.git' + '@' + '64b29dbd5994a511acee69cb9b45ad650ef88359', + Var('chromium_git') + '/external/github.com/harfbuzz/harfbuzz.git' + '@' + '8df5cdbcda495a582e72a7e2ce35d6106401edce', 'src/third_party/libjpeg_turbo': - Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + '02959c3ee17abacfd1339ec22ea93301292ffd56', + Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + 'aa4075f116e4312537d0d3e9dbd5e31096539f94', 'src/third_party/nasm': - Var('chromium_git') + '/chromium/deps/nasm.git' + '@' + '9215e8e1d0fe474ffd3e16c1a07a0f97089e6224', + Var('chromium_git') + '/chromium/deps/nasm.git' + '@' + '7fc833e889d1afda72c06220e5bed8fb43b2e5ce', 'src/tools': - Var('chromium_git') + '/chromium/src/tools' + '@' + '198dc879529652b39ba6e223bcc0bcad5f1facd6', + Var('chromium_git') + '/chromium/src/tools' + '@' + '916dfffd61cbf61075c47d7b480425d7de1483fd', # libyuv-only dependencies (not present in Chromium). 
'src/third_party/gtest-parallel': Var('chromium_git') + '/external/webrtc/deps/third_party/gtest-parallel' + '@' + '1dad0e9f6d82ff994130b529d7d814b40eb32b0e', 'src/third_party/lss': { - 'url': Var('chromium_git') + '/linux-syscall-support.git' + '@' + '92a65a8f5d705d1928874420c8d0d15bde8c89e5', + 'url': Var('chromium_git') + '/linux-syscall-support.git' + '@' + 'ce877209e11aa69dcfffbd53ef90ea1d07136521', 'condition': 'checkout_android or checkout_linux', }, @@ -101,14 +134,32 @@ deps = { 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/auto/src': { - 'url': Var('chromium_git') + '/external/github.com/google/auto.git' + '@' + 'fe67d853d6356943dc79541c892ab6d3e6a7b61a', - 'condition': 'checkout_android', + + 'src/third_party/kotlin_stdlib': { + 'packages': [ + { + 'package': 'chromium/third_party/kotlin_stdlib', + 'version': 'z4_AYYz2Tw5GKikuiDLTuxxf0NJVGLkC3CVcyiIpc-gC', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + 'src/third_party/kotlinc/current': { + 'packages': [ + { + 'package': 'chromium/third_party/kotlinc', + 'version': 'J3BAlA7yf4corBopDhlwuT9W4jR1Z9R55KD3BUTVldQC', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', }, + 'src/third_party/boringssl/src': - 'https://boringssl.googlesource.com/boringssl.git' + '@' + '3a667d10e94186fd503966f5638e134fe9fb4080', + 'https://boringssl.googlesource.com/boringssl.git' + '@' + 'dd5219451c3ce26221762a15d867edf43b463bb2', 'src/base': { - 'url': Var('chromium_git') + '/chromium/src/base' + '@' + 'e9e639622449a893a1b5e32781d072cec08ead72', + 'url': Var('chromium_git') + '/chromium/src/base' + '@' + 'b4c5ce6cb1a7c90de3fdddc80ed439fe87eab443', 'condition': 'checkout_android', }, 'src/third_party/bazel': { @@ -132,19 +183,21 @@ deps = { 'dep_type': 'cipd', }, 'src/third_party/android_ndk': { - 'url': Var('chromium_git') + '/android_ndk.git' + '@' + '401019bf85744311b26c88ced255cd53401af8b7', + 'url': Var('chromium_git') + '/android_ndk.git' + '@' + 
'310956bd122ec2b96049f8d7398de6b717f3452e', 'condition': 'checkout_android', }, + 'src/third_party/androidx': { 'packages': [ { 'package': 'chromium/third_party/androidx', - 'version': '6d8ij5pzYh29WWjPbdbAWFBJSA1nUgkWf2p6wCVZKIsC', + 'version': 'Wr5b9WJiFAzJcmjmvQIePIxk5IgpDl62kaGY_SiLxJEC', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_support_test_runner': { 'packages': [ { @@ -158,16 +211,12 @@ deps = { 'src/third_party/android_sdk/public': { 'packages': [ { - 'package': 'chromium/third_party/android_sdk/public/build-tools/31.0.0', - 'version': 'tRoD45SCi7UleQqSV7MrMQO1_e5P8ysphkCcj6z_cCQC', + 'package': 'chromium/third_party/android_sdk/public/build-tools/33.0.0', + 'version': '-VRKr36Uw8L_iFqqo9nevIBgNMggND5iWxjidyjnCgsC', }, { 'package': 'chromium/third_party/android_sdk/public/emulator', - 'version': 'gMHhUuoQRKfxr-MBn3fNNXZtkAVXtOwMwT7kfx8jkIgC', - }, - { - 'package': 'chromium/third_party/android_sdk/public/extras', - 'version': 'ppQ4TnqDvBHQ3lXx5KPq97egzF5X2FFyOrVHkGmiTMQC', + 'version': '9lGp8nTUCRRWGMnI_96HcKfzjnxEJKUcfvfwmA3wXNkC', }, { 'package': 'chromium/third_party/android_sdk/public/patcher', @@ -175,11 +224,15 @@ deps = { }, { 'package': 'chromium/third_party/android_sdk/public/platform-tools', - 'version': 'g7n_-r6yJd_SGRklujGB1wEt8iyr77FZTUJVS9w6O34C', + 'version': 'RSI3iwryh7URLGRgJHsCvUxj092woTPnKt4pwFcJ6L8C', }, { - 'package': 'chromium/third_party/android_sdk/public/platforms/android-31', - 'version': 'lL3IGexKjYlwjO_1Ga-xwxgwbE_w-lmi2Zi1uOlWUIAC', + 'package': 'chromium/third_party/android_sdk/public/platforms/android-33', + 'version': 'eo5KvW6UVor92LwZai8Zulc624BQZoCu-yn7wa1z_YcC', + }, + { + 'package': 'chromium/third_party/android_sdk/public/platforms/android-tiramisuprivacysandbox', + 'version': 'YWMYkzyxGBgVsty0GhXL1oxbY0pGXQIgFc0Rh7ZMRPYC', }, { 'package': 'chromium/third_party/android_sdk/public/sources/android-31', @@ -187,7 +240,7 @@ deps = { }, { 'package': 
'chromium/third_party/android_sdk/public/cmdline-tools', - 'version': 'Ez2NWws2SJYCF6qw2O-mSCqK6424l3ZdSTpppLyVR_cC', + 'version': 'EWnL2r7oV5GtE9Ef7GyohyFam42wtMtEKYU4dCb3U1YC', }, ], 'condition': 'checkout_android', @@ -207,7 +260,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_build_tools/aapt2', - 'version': 'version:3.6.0-alpha03-5516695-cr0', + 'version': 'STY0BXlZxsEhudnlXQFed-B5UpwehcoM0sYqor6qRqsC', }, ], 'condition': 'checkout_android', @@ -223,6 +276,16 @@ deps = { 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/byte_buddy/android_sdk_build_tools_25_0_2': { + 'packages': [ + { + 'package': 'chromium/third_party/android_sdk/public/build-tools', + 'version': 'kwIs2vdfTm93yEP8LG5aSnchN4BVEdVxbqQtF4XpPdkC', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, 'src/third_party/ced/src': { 'url': Var('chromium_git') + '/external/github.com/google/compact_enc_det.git' + '@' + 'ba412eaaacd3186085babcd901679a48863c7dd5', 'condition': 'checkout_android', @@ -267,7 +330,7 @@ deps = { }, 'src/third_party/icu': { - 'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + 'bf66d373ae781a3498f2babe7b61d933dd774b82', + 'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + 'a2961dc659b4ae847a9c6120718cc2517ee57d9e', }, 'src/third_party/icu4j': { 'packages': [ @@ -293,11 +356,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/jdk', - 'version': 'PfRSnxe8Od6WU4zBXomq-zsgcJgWmm3z4gMQNB-r2QcC', - }, - { - 'package': 'chromium/third_party/jdk/extras', - 'version': 'fkhuOQ3r-zKtWEdKplpo6k0vKkjl-LY_rJTmtzFCQN4C', + 'version': '2Of9Pe_OdO4xoAATuiLDiMVNebKTNO3WrwJGqil4RosC', }, ], 'condition': 'checkout_android', @@ -308,22 +367,31 @@ deps = { 'condition': 'checkout_android', }, 'src/third_party/junit/src': { - 'url': Var('chromium_git') + '/external/junit.git' + '@' + '64155f8a9babcfcf4263cf4d08253a1556e75481', + 'url': Var('chromium_git') + '/external/junit.git' + '@' + 
'05fe2a64f59127c02135be22f416e91260d6ede6', 'condition': 'checkout_android', }, 'src/third_party/libunwindstack': { - 'url': Var('chromium_git') + '/chromium/src/third_party/libunwindstack.git' + '@' + '6868358481bb1e5e20d155c1084dc436c88b5e6b', + 'url': Var('chromium_git') + '/chromium/src/third_party/libunwindstack.git' + '@' + '4dbfa0e8c844c8e243b297bc185e54a99ff94f9e', 'condition': 'checkout_android', }, + 'src/third_party/ninja': { + 'packages': [ + { + 'package': 'infra/3pp/tools/ninja/${{platform}}', + 'version': Var('ninja_version'), + } + ], + 'dep_type': 'cipd', + }, 'src/third_party/mockito/src': { - 'url': Var('chromium_git') + '/external/mockito/mockito.git' + '@' + '04a2a289a4222f80ad20717c25144981210d2eac', + 'url': Var('chromium_git') + '/external/mockito/mockito.git' + '@' + '7c3641bcef717ffa7d765f2c86b847d0aab1aac9', 'condition': 'checkout_android', }, 'src/third_party/objenesis': { 'packages': [ { 'package': 'chromium/third_party/objenesis', - 'version': '9e367f55e5a65781ee77bfcbaa88fb82b30e75c0', + 'version': 'tknDblENYi8IaJYyD6tUahUyHYZlzJ_Y74_QZSz4DpIC', }, ], 'condition': 'checkout_android', @@ -343,7 +411,20 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/r8', - 'version': 'Nu_mvQJe34CotIXadFlA3w732CJ9EvQGuVs4udcZedAC', + 'version': '4Oq32DG2vuDh7Frxj6tH5xyi77sVgBWpvvl4hwvZRR4C', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + # This duplication is intentional, so we avoid updating the r8.jar used by + # dexing unless necessary, since each update invalidates all incremental + # dexing and unnecessarily slows down all bots. 
+ 'src/third_party/r8/d8': { + 'packages': [ + { + 'package': 'chromium/third_party/r8', + 'version': 'PwglNZFRNPkBBXdnY9NfrZFk2ULWDTRxhV9rl2kvkpUC', }, ], 'condition': 'checkout_android', @@ -367,7 +448,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/robolectric', - 'version': 'iC6RDM5EH3GEAzR-1shW_Mg0FeeNE5shq1okkFfuuNQC', + 'version': 'hzetqh1qFI32FOgQroZvGcGdomrgVBJ6WKRnl1KFw6EC', }, ], 'condition': 'checkout_android', @@ -377,7 +458,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/sqlite4java', - 'version': '889660698187baa7c8b0d79f7bf58563125fbd66', + 'version': 'LofjKH9dgXIAJhRYCPQlMFywSwxYimrfDeBmaHc-Z5EC', }, ], 'condition': 'checkout_android', @@ -387,7 +468,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/turbine', - 'version': 'Om6yIEXgJxuqghErK29h9RcMH6VaymMbxwScwXmcN6EC', + 'version': 'Foa7uRpVoKr4YoayCKc9EERkjpmGOE3DAUTWFLL7gKEC', }, ], 'condition': 'checkout_android', @@ -400,1718 +481,1822 @@ deps = { # iOS deps: 'src/ios': { - 'url': Var('chromium_git') + '/chromium/src/ios' + '@' + '81826d980c159f949c2c7901f4dbec9a09788964', + 'url': Var('chromium_git') + '/chromium/src/ios' + '@' + '241921896b64f85de9a32d461462913cbff4baeb', 'condition': 'checkout_ios' }, # Everything coming after this is automatically updated by the auto-roller. 
# === ANDROID_DEPS Generated Code Start === - + # Generated by //third_party/android_deps/fetch_all.py 'src/third_party/android_deps/libs/android_arch_core_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_core_common', - 'version': 'version:2@1.1.1.cr0', + 'version': 'version:2@1.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/android_arch_core_runtime': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_core_runtime', - 'version': 'version:2@1.1.1.cr0', + 'version': 'version:2@1.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/android_arch_lifecycle_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common', - 'version': 'version:2@1.1.1.cr0', + 'version': 'version:2@1.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/android_arch_lifecycle_common_java8': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common_java8', - 'version': 'version:2@1.1.1.cr0', + 'version': 'version:2@1.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/android_arch_lifecycle_livedata': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_livedata', - 'version': 'version:2@1.1.1.cr0', + 'version': 'version:2@1.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/android_arch_lifecycle_livedata_core': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_livedata_core', - 'version': 'version:2@1.1.1.cr0', + 'version': 'version:2@1.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 
'src/third_party/android_deps/libs/android_arch_lifecycle_runtime': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_runtime', - 'version': 'version:2@1.1.1.cr0', + 'version': 'version:2@1.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/android_arch_lifecycle_viewmodel': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_viewmodel', - 'version': 'version:2@1.1.1.cr0', - }, - ], - 'condition': 'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/backport_util_concurrent_backport_util_concurrent': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/backport_util_concurrent_backport_util_concurrent', - 'version': 'version:2@3.1.cr0', - }, - ], - 'condition': 'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/classworlds_classworlds': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/classworlds_classworlds', - 'version': 'version:2@1.1-alpha-2.cr0', + 'version': 'version:2@1.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_animated_vector_drawable': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_animated_vector_drawable', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_appcompat_v7': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_appcompat_v7', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_asynclayoutinflater': { 'packages': [ { 'package': 
'chromium/third_party/android_deps/libs/com_android_support_asynclayoutinflater', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_cardview_v7': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_cardview_v7', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_collections': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_collections', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_coordinatorlayout': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_coordinatorlayout', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_cursoradapter': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_cursoradapter', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_customview': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_customview', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_design': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_design', - 'version': 'version:2@28.0.0.cr0', + 'version': 
'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_documentfile': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_documentfile', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_drawerlayout': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_drawerlayout', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_interpolator': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_interpolator', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_loader': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_loader', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_localbroadcastmanager': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_localbroadcastmanager', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_multidex': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_multidex', - 'version': 'version:2@1.0.0.cr0', + 'version': 'version:2@1.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 
'src/third_party/android_deps/libs/com_android_support_print': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_print', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_recyclerview_v7': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_recyclerview_v7', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_slidingpanelayout': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_slidingpanelayout', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_support_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_annotations', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_support_compat': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_compat', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_support_core_ui': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_core_ui', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_support_core_utils': { 'packages': [ { 
'package': 'chromium/third_party/android_deps/libs/com_android_support_support_core_utils', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_support_fragment': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_fragment', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_support_media_compat': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_media_compat', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_support_v4': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_v4', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_support_vector_drawable': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_vector_drawable', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_swiperefreshlayout': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_swiperefreshlayout', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_transition': { 'packages': [ { 'package': 
'chromium/third_party/android_deps/libs/com_android_support_transition', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_versionedparcelable': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_versionedparcelable', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_viewpager': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_viewpager', - 'version': 'version:2@28.0.0.cr0', + 'version': 'version:2@28.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_tools_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_tools_common', - 'version': 'version:2@30.0.0-alpha10.cr0', + 'version': 'version:2@30.2.0-beta01.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs': { + + 'src/third_party/android_deps/libs/com_android_tools_layoutlib_layoutlib_api': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs', - 'version': 'version:2@1.1.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/com_android_tools_layoutlib_layoutlib_api', + 'version': 'version:2@30.2.0-beta01.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs_configuration': { + + 'src/third_party/android_deps/libs/com_android_tools_sdk_common': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs_configuration', - 'version': 'version:2@1.1.1.cr0', + 'package': 
'chromium/third_party/android_deps/libs/com_android_tools_sdk_common', + 'version': 'version:2@30.2.0-beta01.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/com_android_tools_layoutlib_layoutlib_api': { + + 'src/third_party/android_deps/libs/com_github_ben_manes_caffeine_caffeine': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/com_android_tools_layoutlib_layoutlib_api', - 'version': 'version:2@30.0.0-alpha10.cr0', + 'package': 'chromium/third_party/android_deps/libs/com_github_ben_manes_caffeine_caffeine', + 'version': 'version:2@2.8.8.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/com_android_tools_sdk_common': { + + 'src/third_party/android_deps/libs/com_github_kevinstern_software_and_algorithms': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/com_android_tools_sdk_common', - 'version': 'version:2@30.0.0-alpha10.cr0', + 'package': 'chromium/third_party/android_deps/libs/com_github_kevinstern_software_and_algorithms', + 'version': 'version:2@1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/com_github_ben_manes_caffeine_caffeine': { + + 'src/third_party/android_deps/libs/com_google_android_annotations': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/com_github_ben_manes_caffeine_caffeine', - 'version': 'version:2@2.8.8.cr0', + 'package': 'chromium/third_party/android_deps/libs/com_google_android_annotations', + 'version': 'version:2@4.1.1.4.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/com_github_kevinstern_software_and_algorithms': { + + 'src/third_party/android_deps/libs/com_google_android_apps_common_testing_accessibility_framework_accessibility_test_framework': { 'packages': [ { - 'package': 
'chromium/third_party/android_deps/libs/com_github_kevinstern_software_and_algorithms', - 'version': 'version:2@1.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/com_google_android_apps_common_testing_accessibility_framework_accessibility_test_framework', + 'version': 'version:2@4.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_datatransport_transport_api': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_datatransport_transport_api', - 'version': 'version:2@2.2.1.cr0', + 'version': 'version:2@2.2.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@20.1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth_api_phone': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_api_phone', - 'version': 'version:2@17.5.0.cr0', + 'version': 'version:2@18.0.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth_base': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_base', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@18.0.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_base': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_base', - 'version': 'version:2@17.5.0.cr0', + 'version': 'version:2@18.0.1.cr1', }, ], 
'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_basement': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_basement', - 'version': 'version:2@17.5.0.cr0', + 'version': 'version:2@18.1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cast': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cast_framework': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast_framework', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_clearcut': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_clearcut', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cloud_messaging': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cloud_messaging', - 'version': 'version:2@16.0.0.cr0', - }, - ], - 'condition': 'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/com_google_android_gms_play_services_fido': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_fido', - 'version': 'version:2@19.0.0-beta.cr0', + 'version': 
'version:2@16.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_flags': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_flags', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_gcm': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_gcm', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_iid': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_iid', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_instantapps': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_instantapps', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@18.0.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_location': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_location', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@19.0.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_phenotype': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_phenotype', - 'version': 'version:2@17.0.0.cr0', + 
'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_places_placereport': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_places_placereport', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_stats': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_stats', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_tasks': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_tasks', - 'version': 'version:2@17.2.0.cr0', + 'version': 'version:2@18.0.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_vision': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision', - 'version': 'version:2@18.0.0.cr0', + 'version': 'version:2@20.1.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_vision_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision_common', - 'version': 'version:2@18.0.0.cr0', + 'version': 'version:2@19.1.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_android_material_material': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_material_material', - 'version': 
'version:2@1.6.0-alpha01.cr0', + 'version': 'version:2@1.7.0-alpha02.cr1', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_google_android_play_core_common': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_google_android_play_core_common', + 'version': 'version:2@2.0.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/com_google_android_play_core': { + + 'src/third_party/android_deps/libs/com_google_android_play_feature_delivery': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/com_google_android_play_core', - 'version': 'version:2@1.10.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/com_google_android_play_feature_delivery', + 'version': 'version:2@2.0.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_auto_auto_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_auto_auto_common', - 'version': 'version:2@1.1.2.cr0', + 'version': 'version:2@1.2.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_auto_service_auto_service': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_auto_service_auto_service', - 'version': 'version:2@1.0-rc6.cr0', + 'version': 'version:2@1.0-rc6.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_auto_service_auto_service_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_auto_service_auto_service_annotations', - 'version': 'version:2@1.0-rc6.cr0', + 'version': 'version:2@1.0-rc6.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_auto_value_auto_value_annotations': { 'packages': [ { 
'package': 'chromium/third_party/android_deps/libs/com_google_auto_value_auto_value_annotations', - 'version': 'version:2@1.7.cr0', - }, - ], - 'condition': 'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/com_google_code_findbugs_jformatstring': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/com_google_code_findbugs_jformatstring', - 'version': 'version:2@3.0.0.cr0', + 'version': 'version:2@1.10.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_code_findbugs_jsr305': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_code_findbugs_jsr305', - 'version': 'version:2@3.0.2.cr0', + 'version': 'version:2@3.0.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_code_gson_gson': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_code_gson_gson', - 'version': 'version:2@2.8.0.cr0', + 'version': 'version:2@2.9.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_dagger_dagger': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger', - 'version': 'version:2@2.30.cr0', + 'version': 'version:2@2.30.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_dagger_dagger_compiler': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_compiler', - 'version': 'version:2@2.30.cr0', + 'version': 'version:2@2.30.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_dagger_dagger_producers': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_producers', - 'version': 'version:2@2.30.cr0', + 'version': 'version:2@2.30.cr1', }, ], 
'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_dagger_dagger_spi': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_spi', - 'version': 'version:2@2.30.cr0', + 'version': 'version:2@2.30.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_annotation': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_annotation', - 'version': 'version:2@2.10.0.cr0', + 'version': 'version:2@2.11.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_annotations', - 'version': 'version:2@2.10.0.cr0', + 'version': 'version:2@2.18.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_check_api': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_check_api', - 'version': 'version:2@2.10.0.cr0', + 'version': 'version:2@2.11.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_core': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_core', - 'version': 'version:2@2.10.0.cr0', + 'version': 'version:2@2.11.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_type_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_type_annotations', - 'version': 'version:2@2.10.0.cr0', + 'version': 'version:2@2.11.0.cr1', }, ], 'condition': 
'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_errorprone_javac': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_javac', - 'version': 'version:2@9+181-r4173-1.cr0', + 'version': 'version:2@9+181-r4173-1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_errorprone_javac_shaded': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_javac_shaded', - 'version': 'version:2@9-dev-r4023-3.cr0', + 'version': 'version:2@9-dev-r4023-3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_annotations', - 'version': 'version:2@16.0.0.cr0', + 'version': 'version:2@16.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_common', - 'version': 'version:2@19.5.0.cr0', + 'version': 'version:2@19.5.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_components': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_components', - 'version': 'version:2@16.1.0.cr0', + 'version': 'version:2@16.1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_encoders': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_encoders', - 'version': 'version:2@16.1.0.cr0', + 'version': 'version:2@16.1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 
'src/third_party/android_deps/libs/com_google_firebase_firebase_encoders_json': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_encoders_json', - 'version': 'version:2@17.1.0.cr0', + 'version': 'version:2@17.1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_iid': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_iid', - 'version': 'version:2@21.0.1.cr0', + 'version': 'version:2@21.0.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_iid_interop': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_iid_interop', - 'version': 'version:2@17.0.0.cr0', + 'version': 'version:2@17.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_installations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_installations', - 'version': 'version:2@16.3.5.cr0', + 'version': 'version:2@16.3.5.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_installations_interop': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_installations_interop', - 'version': 'version:2@16.0.1.cr0', + 'version': 'version:2@16.0.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_firebase_firebase_measurement_connector': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_measurement_connector', - 'version': 'version:2@18.0.0.cr0', + 'version': 'version:2@18.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 
'src/third_party/android_deps/libs/com_google_firebase_firebase_messaging': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_messaging', - 'version': 'version:2@21.0.1.cr0', - }, - ], - 'condition': 'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/com_google_flatbuffers_flatbuffers_java': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/com_google_flatbuffers_flatbuffers_java', - 'version': 'version:2@2.0.3.cr0', + 'version': 'version:2@21.0.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_googlejavaformat_google_java_format': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_googlejavaformat_google_java_format', - 'version': 'version:2@1.5.cr0', + 'version': 'version:2@1.5.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_guava_failureaccess': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_guava_failureaccess', - 'version': 'version:2@1.0.1.cr0', + 'version': 'version:2@1.0.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_guava_guava': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_guava_guava', - 'version': 'version:2@31.0-jre.cr0', + 'version': 'version:2@31.1-jre.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_guava_guava_android': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_guava_guava_android', - 'version': 'version:2@31.0-android.cr0', + 'version': 'version:2@31.1-android.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_guava_listenablefuture': { 'packages': [ { 'package': 
'chromium/third_party/android_deps/libs/com_google_guava_listenablefuture', - 'version': 'version:2@1.0.cr0', + 'version': 'version:2@1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_j2objc_j2objc_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_j2objc_j2objc_annotations', - 'version': 'version:2@1.3.cr0', + 'version': 'version:2@1.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_protobuf_protobuf_java': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_protobuf_protobuf_java', - 'version': 'version:2@3.4.0.cr0', + 'version': 'version:2@3.19.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_google_protobuf_protobuf_javalite': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_protobuf_protobuf_javalite', - 'version': 'version:2@3.13.0.cr0', + 'version': 'version:2@3.21.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_googlecode_java_diff_utils_diffutils': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_googlecode_java_diff_utils_diffutils', - 'version': 'version:2@1.3.0.cr0', + 'version': 'version:2@1.3.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_squareup_javapoet': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_squareup_javapoet', - 'version': 'version:2@1.13.0.cr0', + 'version': 'version:2@1.13.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_squareup_javawriter': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_squareup_javawriter', - 'version': 'version:2@2.1.1.cr0', - }, - ], - 'condition': 
'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/io_github_java_diff_utils_java_diff_utils': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/io_github_java_diff_utils_java_diff_utils', - 'version': 'version:2@4.0.cr0', - }, - ], - 'condition': 'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/javax_annotation_javax_annotation_api': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/javax_annotation_javax_annotation_api', - 'version': 'version:2@1.3.2.cr0', + 'version': 'version:2@2.1.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/javax_annotation_jsr250_api': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/javax_annotation_jsr250_api', - 'version': 'version:2@1.0.cr0', - }, - ], - 'condition': 'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/javax_inject_javax_inject': { - 'packages': [ - { - 'package': 'chromium/third_party/android_deps/libs/javax_inject_javax_inject', - 'version': 'version:2@1.cr0', - }, - ], - 'condition': 'checkout_android', - 'dep_type': 'cipd', - }, - 'src/third_party/android_deps/libs/nekohtml_nekohtml': { + + 'src/third_party/android_deps/libs/com_squareup_okio_okio_jvm': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/nekohtml_nekohtml', - 'version': 'version:2@1.9.6.2.cr0', + 'package': 'chromium/third_party/android_deps/libs/com_squareup_okio_okio_jvm', + 'version': 'version:2@3.0.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/nekohtml_xercesminimal': { + + 'src/third_party/android_deps/libs/com_squareup_wire_wire_runtime_jvm': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/nekohtml_xercesminimal', - 'version': 'version:2@1.9.6.2.cr0', + 'package': 
'chromium/third_party/android_deps/libs/com_squareup_wire_wire_runtime_jvm', + 'version': 'version:2@4.5.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/net_ltgt_gradle_incap_incap': { + + 'src/third_party/android_deps/libs/io_github_java_diff_utils_java_diff_utils': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/net_ltgt_gradle_incap_incap', - 'version': 'version:2@0.2.cr0', + 'package': 'chromium/third_party/android_deps/libs/io_github_java_diff_utils_java_diff_utils', + 'version': 'version:2@4.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/net_sf_kxml_kxml2': { + + 'src/third_party/android_deps/libs/io_grpc_grpc_api': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/net_sf_kxml_kxml2', - 'version': 'version:2@2.3.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_api', + 'version': 'version:2@1.49.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_ant_ant': { + + 'src/third_party/android_deps/libs/io_grpc_grpc_binder': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_ant_ant', - 'version': 'version:2@1.8.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_binder', + 'version': 'version:2@1.49.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_ant_ant_launcher': { + + 'src/third_party/android_deps/libs/io_grpc_grpc_context': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_ant_ant_launcher', - 'version': 'version:2@1.8.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_context', + 'version': 'version:2@1.49.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 
'src/third_party/android_deps/libs/org_apache_maven_maven_ant_tasks': { + + 'src/third_party/android_deps/libs/io_grpc_grpc_core': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_ant_tasks', - 'version': 'version:2@2.1.3.cr0', + 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_core', + 'version': 'version:2@1.49.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_artifact': { + + 'src/third_party/android_deps/libs/io_grpc_grpc_protobuf_lite': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_artifact', - 'version': 'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_protobuf_lite', + 'version': 'version:2@1.49.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_artifact_manager': { + + 'src/third_party/android_deps/libs/io_grpc_grpc_stub': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_artifact_manager', - 'version': 'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/io_grpc_grpc_stub', + 'version': 'version:2@1.49.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_error_diagnostics': { + + 'src/third_party/android_deps/libs/io_perfmark_perfmark_api': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_error_diagnostics', - 'version': 'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/io_perfmark_perfmark_api', + 'version': 'version:2@0.25.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_model': { + + 'src/third_party/android_deps/libs/javax_annotation_javax_annotation_api': { 
'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_model', - 'version': 'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/javax_annotation_javax_annotation_api', + 'version': 'version:2@1.3.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_plugin_registry': { + + 'src/third_party/android_deps/libs/javax_annotation_jsr250_api': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_plugin_registry', - 'version': 'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/javax_annotation_jsr250_api', + 'version': 'version:2@1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_profile': { + + 'src/third_party/android_deps/libs/javax_inject_javax_inject': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_profile', - 'version': 'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/javax_inject_javax_inject', + 'version': 'version:2@1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_project': { + + 'src/third_party/android_deps/libs/net_bytebuddy_byte_buddy': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_project', - 'version': 'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/net_bytebuddy_byte_buddy', + 'version': 'version:2@1.14.4.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_repository_metadata': { + + 'src/third_party/android_deps/libs/net_bytebuddy_byte_buddy_agent': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_repository_metadata', - 'version': 
'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/net_bytebuddy_byte_buddy_agent', + 'version': 'version:2@1.14.4.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_maven_settings': { + + 'src/third_party/android_deps/libs/net_ltgt_gradle_incap_incap': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_settings', - 'version': 'version:2@2.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/net_ltgt_gradle_incap_incap', + 'version': 'version:2@0.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_file': { + + 'src/third_party/android_deps/libs/org_bouncycastle_bcprov_jdk18on': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_file', - 'version': 'version:2@1.0-beta-6.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_bouncycastle_bcprov_jdk18on', + 'version': 'version:2@1.72.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_lightweight': { + + 'src/third_party/android_deps/libs/org_ccil_cowan_tagsoup_tagsoup': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_lightweight', - 'version': 'version:2@1.0-beta-6.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_ccil_cowan_tagsoup_tagsoup', + 'version': 'version:2@1.2.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_shared': { + + 'src/third_party/android_deps/libs/org_checkerframework_checker_compat_qual': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_shared', - 'version': 'version:2@1.0-beta-6.cr0', + 'package': 
'chromium/third_party/android_deps/libs/org_checkerframework_checker_compat_qual', + 'version': 'version:2@2.5.5.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_provider_api': { + + 'src/third_party/android_deps/libs/org_checkerframework_checker_qual': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_provider_api', - 'version': 'version:2@1.0-beta-6.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_qual', + 'version': 'version:2@3.25.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_ccil_cowan_tagsoup_tagsoup': { + + 'src/third_party/android_deps/libs/org_checkerframework_checker_util': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_ccil_cowan_tagsoup_tagsoup', - 'version': 'version:2@1.2.1.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_util', + 'version': 'version:2@3.25.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_checkerframework_checker_compat_qual': { + + 'src/third_party/android_deps/libs/org_checkerframework_dataflow_errorprone': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_compat_qual', - 'version': 'version:2@2.5.5.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_dataflow_errorprone', + 'version': 'version:2@3.15.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_checkerframework_checker_qual': { + + 'src/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_qual', - 'version': 'version:2@3.12.0.cr0', + 'package': 
'chromium/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations', + 'version': 'version:2@1.21.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_checkerframework_dataflow_errorprone': { + + 'src/third_party/android_deps/libs/org_conscrypt_conscrypt_openjdk_uber': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_dataflow_errorprone', - 'version': 'version:2@3.15.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_conscrypt_conscrypt_openjdk_uber', + 'version': 'version:2@2.5.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations': { + + 'src/third_party/android_deps/libs/org_eclipse_jgit_org_eclipse_jgit': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations', - 'version': 'version:2@1.17.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_eclipse_jgit_org_eclipse_jgit', + 'version': 'version:2@4.4.1.201607150455-r.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_codehaus_plexus_plexus_container_default': { + + 'src/third_party/android_deps/libs/org_hamcrest_hamcrest': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_codehaus_plexus_plexus_container_default', - 'version': 'version:2@1.0-alpha-9-stable-1.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_hamcrest_hamcrest', + 'version': 'version:2@2.2.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_codehaus_plexus_plexus_interpolation': { + + 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk7': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_codehaus_plexus_plexus_interpolation', - 'version': 
'version:2@1.11.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk7', + 'version': 'version:2@1.8.20.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_codehaus_plexus_plexus_utils': { + + 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk8': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_codehaus_plexus_plexus_utils', - 'version': 'version:2@1.5.15.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk8', + 'version': 'version:2@1.8.20.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_eclipse_jgit_org_eclipse_jgit': { + + 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_android': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_eclipse_jgit_org_eclipse_jgit', - 'version': 'version:2@4.4.1.201607150455-r.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_android', + 'version': 'version:2@1.6.4.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_jetbrains_annotations': { + + 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_core_jvm': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_annotations', - 'version': 'version:2@13.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_core_jvm', + 'version': 'version:2@1.6.4.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib': { + + 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_guava': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib', 
- 'version': 'version:2@1.6.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_guava', + 'version': 'version:2@1.6.4.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_common': { + + 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_metadata_jvm': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_common', - 'version': 'version:2@1.6.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_metadata_jvm', + 'version': 'version:2@0.1.0.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk7': { + + 'src/third_party/android_deps/libs/org_jsoup_jsoup': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk7', - 'version': 'version:2@1.5.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_jsoup_jsoup', + 'version': 'version:2@1.15.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk8': { + + 'src/third_party/android_deps/libs/org_mockito_mockito_android': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk8', - 'version': 'version:2@1.5.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_mockito_mockito_android', + 'version': 'version:2@5.3.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_android': { + + 'src/third_party/android_deps/libs/org_mockito_mockito_core': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_android', - 'version': 
'version:2@1.5.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_mockito_mockito_core', + 'version': 'version:2@5.3.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_core_jvm': { + + 'src/third_party/android_deps/libs/org_mockito_mockito_subclass': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_core_jvm', - 'version': 'version:2@1.5.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_mockito_mockito_subclass', + 'version': 'version:2@5.3.1.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_metadata_jvm': { + + 'src/third_party/android_deps/libs/org_objenesis_objenesis': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_metadata_jvm', - 'version': 'version:2@0.1.0.cr0', + 'package': 'chromium/third_party/android_deps/libs/org_objenesis_objenesis', + 'version': 'version:2@3.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_ow2_asm_asm': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm', - 'version': 'version:2@7.0.cr0', + 'version': 'version:2@9.5.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_ow2_asm_asm_analysis': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_analysis', - 'version': 'version:2@7.0.cr0', + 'version': 'version:2@9.5.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_ow2_asm_asm_commons': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_commons', - 'version': 'version:2@7.0.cr0', + 'version': 'version:2@9.5.cr1', }, ], 'condition': 
'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_ow2_asm_asm_tree': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_tree', - 'version': 'version:2@7.0.cr0', + 'version': 'version:2@9.5.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_ow2_asm_asm_util': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_util', - 'version': 'version:2@7.0.cr0', + 'version': 'version:2@9.5.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_pcollections_pcollections': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_pcollections_pcollections', - 'version': 'version:2@2.1.2.cr0', + 'version': 'version:2@3.1.4.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_annotations', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_junit': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_junit', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_robolectric_nativeruntime': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_robolectric_nativeruntime', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + + 'src/third_party/android_deps/libs/org_robolectric_nativeruntime_dist_compat': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_robolectric_nativeruntime_dist_compat', + 
'version': 'version:2@1.0.1.cr1', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + 'src/third_party/android_deps/libs/org_robolectric_pluginapi': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_pluginapi', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_plugins_maven_dependency_resolver': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_plugins_maven_dependency_resolver', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_resources': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_resources', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_robolectric': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_robolectric', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_sandbox': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_sandbox', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_shadowapi': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadowapi', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 
'src/third_party/android_deps/libs/org_robolectric_shadows_framework': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadows_framework', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_shadows_playservices': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadows_playservices', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_utils': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_utils', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/org_robolectric_utils_reflector': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_utils_reflector', - 'version': 'version:2@4.3.1.cr0', + 'version': 'version:2@4.10.3.cr1', }, ], 'condition': 'checkout_android', @@ -2197,29 +2382,74 @@ hooks = [ 'condition': 'checkout_mac', }, { - 'name': 'msan_chained_origins', + 'name': 'msan_chained_origins_focal', + 'pattern': '.', + 'condition': 'checkout_instrumented_libraries', + 'action': [ 'python3', + 'src/third_party/depot_tools/download_from_google_storage.py', + '--no_resume', + '--no_auth', + '--bucket', 'chromium-instrumented-libraries', + '-s', 'src/third_party/instrumented_libraries/binaries/msan-chained-origins-focal.tgz.sha1', + ], + }, + { + 'name': 'msan_no_origins_focal', + 'pattern': '.', + 'condition': 'checkout_instrumented_libraries', + 'action': [ 'python3', + 'src/third_party/depot_tools/download_from_google_storage.py', + '--no_resume', + '--no_auth', + '--bucket', 'chromium-instrumented-libraries', + '-s', 
'src/third_party/instrumented_libraries/binaries/msan-no-origins-focal.tgz.sha1', + ], + }, + { + 'name': 'msan_chained_origins_focal', 'pattern': '.', 'condition': 'checkout_instrumented_libraries', 'action': [ 'python3', 'src/third_party/depot_tools/download_from_google_storage.py', - "--no_resume", - "--no_auth", - "--bucket", "chromium-instrumented-libraries", - "-s", "src/third_party/instrumented_libraries/binaries/msan-chained-origins.tgz.sha1", + '--no_resume', + '--no_auth', + '--bucket', 'chromium-instrumented-libraries', + '-s', 'src/third_party/instrumented_libraries/binaries/msan-chained-origins-focal.tgz.sha1', ], }, { - 'name': 'msan_no_origins', + 'name': 'msan_no_origins_focal', 'pattern': '.', 'condition': 'checkout_instrumented_libraries', 'action': [ 'python3', 'src/third_party/depot_tools/download_from_google_storage.py', - "--no_resume", - "--no_auth", - "--bucket", "chromium-instrumented-libraries", - "-s", "src/third_party/instrumented_libraries/binaries/msan-no-origins.tgz.sha1", + '--no_resume', + '--no_auth', + '--bucket', 'chromium-instrumented-libraries', + '-s', 'src/third_party/instrumented_libraries/binaries/msan-no-origins-focal.tgz.sha1', ], }, + { + 'name': 'Download Fuchsia SDK from GCS', + 'pattern': '.', + 'condition': 'checkout_fuchsia', + 'action': [ + 'python3', + 'src/build/fuchsia/update_sdk.py', + '--cipd-prefix={fuchsia_sdk_cipd_prefix}', + '--version={fuchsia_version}', + ], + }, + { + 'name': 'Download Fuchsia system images', + 'pattern': '.', + 'condition': 'checkout_fuchsia and checkout_fuchsia_product_bundles', + 'action': [ + 'python3', + 'src/build/fuchsia/update_product_bundles.py', + '{checkout_fuchsia_boot_images}', + ], + }, { # Pull clang if needed or requested via GYP_DEFINES. # Note: On Win, this should run after win_toolchain, as it may use it. 
@@ -2238,7 +2468,9 @@ hooks = [ { 'name': 'clang_format_win', 'pattern': '.', - 'action': [ 'download_from_google_storage', + 'condition': 'host_os == "win"', + 'action': [ 'python3', + 'src/third_party/depot_tools/download_from_google_storage.py', '--no_resume', '--platform=win32', '--no_auth', @@ -2247,21 +2479,38 @@ hooks = [ ], }, { - 'name': 'clang_format_mac', + 'name': 'clang_format_mac_x64', 'pattern': '.', - 'action': [ 'download_from_google_storage', + 'condition': 'host_os == "mac" and host_cpu == "x64"', + 'action': [ 'python3', + 'src/third_party/depot_tools/download_from_google_storage.py', '--no_resume', '--platform=darwin', '--no_auth', '--bucket', 'chromium-clang-format', - '-s', 'src/buildtools/mac/clang-format.sha1', + '-s', 'src/buildtools/mac/clang-format.x64.sha1', + '-o', 'src/buildtools/mac/clang-format', ], }, + { + 'name': 'clang_format_mac_arm64', + 'pattern': '.', + 'condition': 'host_os == "mac" and host_cpu == "arm64"', + 'action': [ 'python3', + 'src/third_party/depot_tools/download_from_google_storage.py', + '--no_resume', + '--no_auth', + '--bucket', 'chromium-clang-format', + '-s', 'src/buildtools/mac/clang-format.arm64.sha1', + '-o', 'src/buildtools/mac/clang-format', + ], + }, { 'name': 'clang_format_linux', 'pattern': '.', 'condition': 'host_os == "linux"', - 'action': [ 'download_from_google_storage', + 'action': [ 'python3', + 'src/third_party/depot_tools/download_from_google_storage.py', '--no_resume', '--platform=linux*', '--no_auth', @@ -2303,18 +2552,6 @@ hooks = [ '-d', 'src/tools/luci-go/linux64', ], }, - { - # We used to use src as a CIPD root. We moved it to a different directory - # in crrev.com/c/930178 but left the clobber here to ensure that that CL - # could be reverted safely. This can be safely removed once crbug.com/794764 - # is resolved. 
- 'name': 'Android Clobber Deprecated CIPD Root', - 'pattern': '.', - 'condition': 'checkout_android', - 'action': ['src/build/cipd/clobber_cipd_root.py', - '--root', 'src', - ], - }, { 'name': 'Generate component metadata for tests', 'pattern': '.', diff --git a/files/README.chromium b/files/README.chromium index 3f68e21e..880191e4 100644 --- a/files/README.chromium +++ b/files/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1837 +Version: 1871 License: BSD License File: LICENSE diff --git a/files/README.md b/files/README.md index db70b7f0..95eeb04c 100644 --- a/files/README.md +++ b/files/README.md @@ -7,6 +7,7 @@ * Optimized for SSSE3/AVX2 on x86/x64. * Optimized for Neon on Arm. * Optimized for MSA on Mips. +* Optimized for RVV on RISC-V. ### Development diff --git a/files/build_overrides/build.gni b/files/build_overrides/build.gni index c8490313..d9d01d51 100644 --- a/files/build_overrides/build.gni +++ b/files/build_overrides/build.gni @@ -13,6 +13,9 @@ build_with_chromium = false # Some non-Chromium builds don't support building java targets. enable_java_templates = true +# Enables assertions on safety checks in libc++. +enable_safe_libcxx = true + # Allow using custom suppressions files (currently not used by libyuv). asan_suppressions_file = "//build/sanitizers/asan_suppressions.cc" lsan_suppressions_file = "//build/sanitizers/lsan_suppressions.cc" diff --git a/files/build_overrides/partition_alloc.gni b/files/build_overrides/partition_alloc.gni new file mode 100644 index 00000000..dcf8ac2d --- /dev/null +++ b/files/build_overrides/partition_alloc.gni @@ -0,0 +1,17 @@ +# Copyright 2022 The LibYuv Project Authors. All rights reserved. +# +# Use of this source code is governed by a BSD-style license +# that can be found in the LICENSE file in the root of the source +# tree. An additional intellectual property rights grant can be found +# in the file PATENTS. 
All contributing project authors may +# be found in the AUTHORS file in the root of the source tree. + +# Use default values for PartitionAlloc as standalone library from +# base/allocator/partition_allocator/build_overrides/partition_alloc.gni +use_partition_alloc_as_malloc_default = false +use_allocator_shim_default = false +enable_backup_ref_ptr_support_default = false +enable_mte_checked_ptr_support_default = false +put_ref_count_in_previous_slot_default = false +enable_backup_ref_ptr_slow_checks_default = false +enable_dangling_raw_ptr_checks_default = false diff --git a/files/docs/environment_variables.md b/files/docs/environment_variables.md index dd5d59fb..4eb09659 100644 --- a/files/docs/environment_variables.md +++ b/files/docs/environment_variables.md @@ -40,6 +40,9 @@ By default the cpu is detected and the most advanced form of SIMD is used. But LIBYUV_DISABLE_LSX LIBYUV_DISABLE_LASX +## RISCV CPUs + LIBYUV_DISABLE_RVV + # Test Width/Height/Repeat The unittests default to a small image (128x72) to run fast. This can be set by environment variable to test a specific resolutions. diff --git a/files/docs/getting_started.md b/files/docs/getting_started.md index 15b19ab2..b19f0009 100644 --- a/files/docs/getting_started.md +++ b/files/docs/getting_started.md @@ -220,6 +220,35 @@ Install cmake: http://www.cmake.org/ make -j4 make package +## Building RISC-V target with cmake + +### Prerequisite: build risc-v clang toolchain and qemu + +If you don't have prebuilt clang and riscv64 qemu, run the script to download source and build them. + + ./riscv_script/prepare_toolchain_qemu.sh + +After running script, clang & qemu are built in `build-toolchain-qemu/riscv-clang/` & `build-toolchain-qemu/riscv-qemu/`. + +### Cross-compile for RISC-V target + cmake -B out/Release/ -DUNIT_TEST=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE="./riscv_script/riscv-clang.cmake" \ + -DTOOLCHAIN_PATH={TOOLCHAIN_PATH} \ + -DUSE_RVV=ON . 
+ cmake --build out/Release/ + + +### Run on QEMU + +#### Run libyuv_unittest on QEMU + cd out/Release/ + USE_RVV=ON \ + TOOLCHAIN_PATH={TOOLCHAIN_PATH} \ + QEMU_PREFIX_PATH={QEMU_PREFIX_PATH} \ + ../../riscv_script/run_qemu.sh libyuv_unittest + + ## Setup for Arm Cross compile See also https://www.ccoderun.ca/programming/2015-12-20_CrossCompiling/index.html diff --git a/files/include/libyuv/convert.h b/files/include/libyuv/convert.h index 46d37159..88619a4f 100644 --- a/files/include/libyuv/convert.h +++ b/files/include/libyuv/convert.h @@ -151,6 +151,33 @@ int MM21ToI420(const uint8_t* src_y, int width, int height); +// Convert MM21 to YUY2 +LIBYUV_API +int MM21ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height); + +// Convert MT2T to P010 +// Note that src_y and src_uv point to packed 10-bit values, so the Y plane will +// be 10 / 8 times the dimensions of the image. Also for this reason, +// src_stride_y and src_stride_uv are given in bytes. +LIBYUV_API +int MT2TToP010(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height); + // Convert I422 to NV21. 
LIBYUV_API int I422ToNV21(const uint8_t* src_y, @@ -272,6 +299,23 @@ int I210ToI422(const uint16_t* src_y, int width, int height); +#define H410ToH420 I410ToI420 +LIBYUV_API +int I410ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + #define H410ToH444 I410ToI444 LIBYUV_API int I410ToI444(const uint16_t* src_y, @@ -323,6 +367,23 @@ int I212ToI422(const uint16_t* src_y, int width, int height); +#define H212ToH420 I212ToI420 +LIBYUV_API +int I212ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + #define H412ToH444 I412ToI444 LIBYUV_API int I412ToI444(const uint16_t* src_y, @@ -340,6 +401,23 @@ int I412ToI444(const uint16_t* src_y, int width, int height); +#define H412ToH420 I412ToI420 +LIBYUV_API +int I412ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + #define I412ToI012 I410ToI010 #define H410ToH010 I410ToI010 #define H412ToH012 I410ToI010 @@ -560,6 +638,36 @@ int NV16ToNV24(const uint8_t* src_y, int width, int height); +// Convert P010 to I010. +LIBYUV_API +int P010ToI010(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert P012 to I012. 
+LIBYUV_API +int P012ToI012(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height); + // Convert P010 to P410. LIBYUV_API int P010ToP410(const uint16_t* src_y, @@ -677,6 +785,21 @@ int ARGBToI420(const uint8_t* src_argb, int width, int height); +// Convert ARGB to I420 with Alpha +LIBYUV_API +int ARGBToI420Alpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height); + // BGRA little endian (argb in memory) to I420. LIBYUV_API int BGRAToI420(const uint8_t* src_bgra, diff --git a/files/include/libyuv/convert_argb.h b/files/include/libyuv/convert_argb.h index f66d20ce..35eeac9b 100644 --- a/files/include/libyuv/convert_argb.h +++ b/files/include/libyuv/convert_argb.h @@ -67,6 +67,8 @@ LIBYUV_API extern const struct YuvConstants kYvuV2020Constants; // BT.2020 full I210ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) #define I410ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ I410ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) +#define I012ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ + I012ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) #define I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ I420AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) #define I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ @@ -404,6 +406,32 @@ int U444ToABGR(const uint8_t* src_y, int width, int height); +// Convert I444 to RGB24. +LIBYUV_API +int I444ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); + +// Convert I444 to RAW. 
+LIBYUV_API +int I444ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height); + // Convert I010 to ARGB. LIBYUV_API int I010ToARGB(const uint16_t* src_y, @@ -1312,6 +1340,32 @@ int J420ToRAW(const uint8_t* src_y, int width, int height); +// Convert I422 to RGB24. +LIBYUV_API +int I422ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); + +// Convert I422 to RAW. +LIBYUV_API +int I422ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height); + LIBYUV_API int I420ToRGB565(const uint8_t* src_y, int src_stride_y, @@ -1495,6 +1549,20 @@ int I444ToARGBMatrix(const uint8_t* src_y, int width, int height); +// Convert I444 to RGB24 with matrix. +LIBYUV_API +int I444ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height); + // Convert 10 bit 420 YUV to ARGB with matrix. LIBYUV_API int I010ToAR30Matrix(const uint16_t* src_y, @@ -1893,6 +1961,20 @@ int I420ToRGB24Matrix(const uint8_t* src_y, int width, int height); +// Convert I422 to RGB24 with matrix. +LIBYUV_API +int I422ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height); + // Convert I420 to RGB565 with specified color matrix. 
LIBYUV_API int I420ToRGB565Matrix(const uint8_t* src_y, @@ -1907,6 +1989,20 @@ int I420ToRGB565Matrix(const uint8_t* src_y, int width, int height); +// Convert I422 to RGB565 with specified color matrix. +LIBYUV_API +int I422ToRGB565Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const struct YuvConstants* yuvconstants, + int width, + int height); + // Convert I420 to AR30 with matrix. LIBYUV_API int I420ToAR30Matrix(const uint8_t* src_y, @@ -1961,6 +2057,36 @@ int I422ToARGBMatrixFilter(const uint8_t* src_y, int height, enum FilterMode filter); +// Convert I422 to RGB24 with matrix and UV filter mode. +LIBYUV_API +int I422ToRGB24MatrixFilter(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter); + +// Convert I420 to RGB24 with matrix and UV filter mode. +LIBYUV_API +int I420ToRGB24MatrixFilter(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter); + // Convert I010 to AR30 with matrix and UV filter mode. 
LIBYUV_API int I010ToAR30MatrixFilter(const uint16_t* src_y, diff --git a/files/include/libyuv/convert_from_argb.h b/files/include/libyuv/convert_from_argb.h index 2a488838..ff2a581a 100644 --- a/files/include/libyuv/convert_from_argb.h +++ b/files/include/libyuv/convert_from_argb.h @@ -209,10 +209,10 @@ int ARGBToJ420(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_yj, int dst_stride_yj, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, int width, int height); @@ -222,10 +222,10 @@ int ARGBToJ422(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_yj, int dst_stride_yj, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, int width, int height); @@ -238,6 +238,41 @@ int ARGBToJ400(const uint8_t* src_argb, int width, int height); +// Convert ABGR to J420. (JPeg full range I420). +LIBYUV_API +int ABGRToJ420(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, + int width, + int height); + +// Convert ABGR to J422. +LIBYUV_API +int ABGRToJ422(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, + int width, + int height); + +// Convert ABGR to J400. (JPeg full range). +LIBYUV_API +int ABGRToJ400(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height); + // Convert RGBA to J400. (JPeg full range). 
LIBYUV_API int RGBAToJ400(const uint8_t* src_rgba, diff --git a/files/include/libyuv/cpu_id.h b/files/include/libyuv/cpu_id.h index fb90c6c7..203f7e0d 100644 --- a/files/include/libyuv/cpu_id.h +++ b/files/include/libyuv/cpu_id.h @@ -55,6 +55,11 @@ static const int kCpuHasLOONGARCH = 0x2000000; static const int kCpuHasLSX = 0x4000000; static const int kCpuHasLASX = 0x8000000; +// These flags are only valid on RISCV processors. +static const int kCpuHasRISCV = 0x10000000; +static const int kCpuHasRVV = 0x20000000; +static const int kCpuHasRVVZVFH = 0x40000000; + // Optional init function. TestCpuFlag does an auto-init. // Returns cpu_info flags. LIBYUV_API @@ -78,6 +83,8 @@ LIBYUV_API int ArmCpuCaps(const char* cpuinfo_name); LIBYUV_API int MipsCpuCaps(const char* cpuinfo_name); +LIBYUV_API +int RiscvCpuCaps(const char* cpuinfo_name); // For testing, allow CPU flags to be disabled. // ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. diff --git a/files/include/libyuv/planar_functions.h b/files/include/libyuv/planar_functions.h index 1ef2256b..154f2f21 100644 --- a/files/include/libyuv/planar_functions.h +++ b/files/include/libyuv/planar_functions.h @@ -85,13 +85,23 @@ void SetPlane(uint8_t* dst_y, // Convert a plane of tiles of 16 x H to linear. LIBYUV_API -void DetilePlane(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height, - int tile_height); +int DetilePlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height, + int tile_height); + +// Convert a plane of 16 bit tiles of 16 x H to linear. +LIBYUV_API +int DetilePlane_16(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int width, + int height, + int tile_height); // Convert a UV plane of tiles of 16 x H into linear U and V planes. 
LIBYUV_API @@ -105,6 +115,18 @@ void DetileSplitUVPlane(const uint8_t* src_uv, int height, int tile_height); +// Convert a Y and UV plane of tiles into interlaced YUY2. +LIBYUV_API +void DetileToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height, + int tile_height); + // Split interleaved UV plane into separate U and V planes. LIBYUV_API void SplitUVPlane(const uint8_t* src_uv, @@ -370,7 +392,26 @@ int I210Copy(const uint16_t* src_y, int width, int height); +// Copy I410 to I410. +#define I410ToI410 I410Copy +LIBYUV_API +int I410Copy(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height); + // Copy NV12. Supports inverting. +LIBYUV_API int NV12Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, @@ -383,6 +424,7 @@ int NV12Copy(const uint8_t* src_y, int height); // Copy NV21. Supports inverting. +LIBYUV_API int NV21Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, diff --git a/files/include/libyuv/rotate.h b/files/include/libyuv/rotate.h index 684ed5e6..37460c4a 100644 --- a/files/include/libyuv/rotate.h +++ b/files/include/libyuv/rotate.h @@ -85,6 +85,60 @@ int I444Rotate(const uint8_t* src_y, int height, enum RotationMode mode); +// Rotate I010 frame. +LIBYUV_API +int I010Rotate(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode); + +// Rotate I210 frame. 
+LIBYUV_API +int I210Rotate(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode); + +// Rotate I410 frame. +LIBYUV_API +int I410Rotate(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode); + // Rotate NV12 input and store in I420. LIBYUV_API int NV12ToI420Rotate(const uint8_t* src_y, @@ -156,6 +210,16 @@ void RotatePlane270(const uint8_t* src, int width, int height); +// Rotate a plane by 0, 90, 180, or 270. +LIBYUV_API +int RotatePlane_16(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height, + enum RotationMode mode); + // Rotations for when U and V are interleaved. 
// These functions take one UV input pointer and // split the data into two buffers while diff --git a/files/include/libyuv/rotate_row.h b/files/include/libyuv/rotate_row.h index aa8528a9..2dd8c03d 100644 --- a/files/include/libyuv/rotate_row.h +++ b/files/include/libyuv/rotate_row.h @@ -42,6 +42,8 @@ extern "C" { // The following are available for GCC 32 or 64 bit: #if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__)) #define HAS_TRANSPOSEWX8_SSSE3 +#define HAS_TRANSPOSE4X4_32_SSE2 +#define HAS_TRANSPOSE4X4_32_AVX2 #endif // The following are available for 64 bit GCC: @@ -54,6 +56,7 @@ extern "C" { (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) #define HAS_TRANSPOSEWX8_NEON #define HAS_TRANSPOSEUVWX8_NEON +#define HAS_TRANSPOSE4X4_32_NEON #endif #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) @@ -215,6 +218,48 @@ void TransposeUVWx16_Any_LSX(const uint8_t* src, uint8_t* dst_b, int dst_stride_b, int width); +void TransposeWxH_16_C(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height); + +void TransposeWx8_16_C(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width); +void TransposeWx1_16_C(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width); + +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); + +void Transpose4x4_32_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); + +void Transpose4x4_32_AVX2(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); + +void Transpose4x4_32_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); #ifdef __cplusplus } // extern "C" diff --git a/files/include/libyuv/row.h b/files/include/libyuv/row.h index 1a1cf4b6..5b244d77 100644 --- a/files/include/libyuv/row.h +++ 
b/files/include/libyuv/row.h @@ -11,7 +11,8 @@ #ifndef INCLUDE_LIBYUV_ROW_H_ #define INCLUDE_LIBYUV_ROW_H_ -#include // For malloc. +#include // For NULL +#include // For malloc #include "libyuv/basic_types.h" @@ -75,9 +76,6 @@ extern "C" { (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) // Conversions: #define HAS_ABGRTOYROW_SSSE3 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ABGRTOUVROW_SSSE3 -#endif #define HAS_ARGB1555TOARGBROW_SSE2 #define HAS_ARGB4444TOARGBROW_SSE2 #define HAS_ARGBEXTRACTALPHAROW_SSE2 @@ -92,12 +90,6 @@ extern "C" { #define HAS_ARGBTOYJROW_SSSE3 #define HAS_ARGBTOYROW_SSSE3 #define HAS_BGRATOYROW_SSSE3 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ARGBTOUV444ROW_SSSE3 -#define HAS_ARGBTOUVJROW_SSSE3 -#define HAS_ARGBTOUVROW_SSSE3 -#define HAS_BGRATOUVROW_SSSE3 -#endif #define HAS_COPYROW_ERMS #define HAS_COPYROW_SSE2 #define HAS_H422TOARGBROW_SSSE3 @@ -111,6 +103,7 @@ extern "C" { #define HAS_I422TOUYVYROW_SSE2 #define HAS_I422TOYUY2ROW_SSE2 #define HAS_I444TOARGBROW_SSSE3 +#define HAS_I444TORGB24ROW_SSSE3 #define HAS_INTERPOLATEROW_SSSE3 #define HAS_J400TOARGBROW_SSE2 #define HAS_J422TOARGBROW_SSSE3 @@ -124,16 +117,13 @@ extern "C" { #define HAS_NV21TORGB24ROW_SSSE3 #define HAS_RAWTOARGBROW_SSSE3 #define HAS_RAWTORGB24ROW_SSSE3 +#define HAS_RAWTOYJROW_SSSE3 #define HAS_RAWTOYROW_SSSE3 #define HAS_RGB24TOARGBROW_SSSE3 +#define HAS_RGB24TOYJROW_SSSE3 #define HAS_RGB24TOYROW_SSSE3 #define HAS_RGB565TOARGBROW_SSE2 #define HAS_RGBATOYROW_SSSE3 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_RGB24TOYJROW_SSSE3 -#define HAS_RAWTOYJROW_SSSE3 -#define HAS_RGBATOUVROW_SSSE3 -#endif #define HAS_SETROW_ERMS #define HAS_SETROW_X86 #define HAS_SPLITUVROW_SSE2 @@ -145,13 +135,18 @@ extern "C" { #define HAS_YUY2TOUV422ROW_SSE2 #define HAS_YUY2TOUVROW_SSE2 #define HAS_YUY2TOYROW_SSE2 +#if !defined(LIBYUV_BIT_EXACT) +#define HAS_ABGRTOUVROW_SSSE3 +#define HAS_ARGBTOUV444ROW_SSSE3 +#define HAS_ARGBTOUVJROW_SSSE3 +#define HAS_ARGBTOUVROW_SSSE3 
+#define HAS_BGRATOUVROW_SSSE3 +#define HAS_RGBATOUVROW_SSSE3 +#endif // Effects: #define HAS_ARGBADDROW_SSE2 #define HAS_ARGBAFFINEROW_SSE2 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ARGBATTENUATEROW_SSSE3 -#endif #define HAS_ARGBBLENDROW_SSSE3 #define HAS_ARGBCOLORMATRIXROW_SSSE3 #define HAS_ARGBCOLORTABLEROW_X86 @@ -176,6 +171,9 @@ extern "C" { #define HAS_SOBELXROW_SSE2 #define HAS_SOBELXYROW_SSE2 #define HAS_SOBELYROW_SSE2 +#if !defined(LIBYUV_BIT_EXACT) +#define HAS_ARGBATTENUATEROW_SSSE3 +#endif // The following functions fail on gcc/clang 32 bit with fpic and framepointer. // caveat: clangcl uses row_win.cc which works. @@ -201,17 +199,10 @@ extern "C" { #define HAS_ARGBSHUFFLEROW_AVX2 #define HAS_ARGBTORGB565DITHERROW_AVX2 #define HAS_ARGBTOYJROW_AVX2 -#define HAS_RAWTOYJROW_AVX2 -#define HAS_RGB24TOYJROW_AVX2 #define HAS_ARGBTOYROW_AVX2 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ARGBTOUVJROW_AVX2 -#define HAS_ARGBTOUVROW_AVX2 -#endif #define HAS_COPYROW_AVX #define HAS_H422TOARGBROW_AVX2 #define HAS_HALFFLOATROW_AVX2 -// #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast #define HAS_I422TOARGB1555ROW_AVX2 #define HAS_I422TOARGB4444ROW_AVX2 #define HAS_I422TOARGBROW_AVX2 @@ -219,6 +210,7 @@ extern "C" { #define HAS_I422TORGB565ROW_AVX2 #define HAS_I422TORGBAROW_AVX2 #define HAS_I444TOARGBROW_AVX2 +#define HAS_I444TORGB24ROW_AVX2 #define HAS_INTERPOLATEROW_AVX2 #define HAS_J422TOARGBROW_AVX2 #define HAS_MERGEUVROW_AVX2 @@ -228,6 +220,8 @@ extern "C" { #define HAS_NV12TORGB565ROW_AVX2 #define HAS_NV21TOARGBROW_AVX2 #define HAS_NV21TORGB24ROW_AVX2 +#define HAS_RAWTOYJROW_AVX2 +#define HAS_RGB24TOYJROW_AVX2 #define HAS_SPLITUVROW_AVX2 #define HAS_UYVYTOARGBROW_AVX2 #define HAS_UYVYTOUV422ROW_AVX2 @@ -237,16 +231,21 @@ extern "C" { #define HAS_YUY2TOUV422ROW_AVX2 #define HAS_YUY2TOUVROW_AVX2 #define HAS_YUY2TOYROW_AVX2 +// #define HAS_HALFFLOATROW_F16C // Enable to test half float cast +#if !defined(LIBYUV_BIT_EXACT) +#define 
HAS_ARGBTOUVJROW_AVX2 +#define HAS_ARGBTOUVROW_AVX2 +#endif // Effects: #define HAS_ARGBADDROW_AVX2 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ARGBATTENUATEROW_AVX2 -#endif #define HAS_ARGBMULTIPLYROW_AVX2 #define HAS_ARGBSUBTRACTROW_AVX2 #define HAS_ARGBUNATTENUATEROW_AVX2 #define HAS_BLENDPLANEROW_AVX2 +#if !defined(LIBYUV_BIT_EXACT) +#define HAS_ARGBATTENUATEROW_AVX2 +#endif #if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ defined(_MSC_VER) @@ -282,28 +281,32 @@ extern "C" { // The following are available for gcc/clang x86 platforms: // TODO(fbarchard): Port to Visual C #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) +#define HAS_AB64TOARGBROW_SSSE3 #define HAS_ABGRTOAR30ROW_SSSE3 +#define HAS_ABGRTOYJROW_SSSE3 +#define HAS_AR64TOARGBROW_SSSE3 +#define HAS_ARGBTOAB64ROW_SSSE3 #define HAS_ARGBTOAR30ROW_SSSE3 #define HAS_ARGBTOAR64ROW_SSSE3 -#define HAS_ARGBTOAB64ROW_SSSE3 -#define HAS_AR64TOARGBROW_SSSE3 -#define HAS_AB64TOARGBROW_SSSE3 #define HAS_CONVERT16TO8ROW_SSSE3 #define HAS_CONVERT8TO16ROW_SSE2 #define HAS_DETILEROW_SSE2 +#define HAS_DETILEROW_16_SSE2 +#define HAS_DETILEROW_16_AVX #define HAS_DETILESPLITUVROW_SSSE3 +#define HAS_DETILETOYUY2_SSE2 #define HAS_HALFMERGEUVROW_SSSE3 #define HAS_I210TOAR30ROW_SSSE3 #define HAS_I210TOARGBROW_SSSE3 #define HAS_I212TOAR30ROW_SSSE3 #define HAS_I212TOARGBROW_SSSE3 #define HAS_I400TOARGBROW_SSE2 -#define HAS_I422TOAR30ROW_SSSE3 #define HAS_I410TOAR30ROW_SSSE3 #define HAS_I410TOARGBROW_SSSE3 +#define HAS_I422TOAR30ROW_SSSE3 #define HAS_MERGEARGBROW_SSE2 -#define HAS_MERGEXRGBROW_SSE2 #define HAS_MERGERGBROW_SSSE3 +#define HAS_MERGEXRGBROW_SSE2 #define HAS_MIRRORUVROW_SSSE3 #define HAS_NV21TOYUV24ROW_SSSE3 #define HAS_P210TOAR30ROW_SSSE3 @@ -312,15 +315,17 @@ extern "C" { #define HAS_P410TOARGBROW_SSSE3 #define HAS_RAWTORGBAROW_SSSE3 #define HAS_RGB24MIRRORROW_SSSE3 -#if !defined(LIBYUV_BIT_EXACT) #define HAS_RGBATOYJROW_SSSE3 -#endif #define HAS_SPLITARGBROW_SSE2 
#define HAS_SPLITARGBROW_SSSE3 +#define HAS_SPLITRGBROW_SSSE3 #define HAS_SPLITXRGBROW_SSE2 #define HAS_SPLITXRGBROW_SSSE3 -#define HAS_SPLITRGBROW_SSSE3 #define HAS_SWAPUVROW_SSSE3 +#define HAS_YUY2TONVUVROW_SSE2 +#if !defined(LIBYUV_BIT_EXACT) +#define HAS_ABGRTOUVJROW_SSSE3 +#endif #if defined(__x86_64__) || !defined(__pic__) // TODO(fbarchard): fix build error on android_full_debug=1 @@ -335,31 +340,20 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && \ (defined(__x86_64__) || defined(__i386__)) && \ (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +#define HAS_AB64TOARGBROW_AVX2 #define HAS_ABGRTOAR30ROW_AVX2 -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ABGRTOUVROW_AVX2 +#define HAS_ABGRTOYJROW_AVX2 #define HAS_ABGRTOYROW_AVX2 -#endif +#define HAS_AR64TOARGBROW_AVX2 +#define HAS_ARGBTOAB64ROW_AVX2 #define HAS_ARGBTOAR30ROW_AVX2 +#define HAS_ARGBTOAR64ROW_AVX2 #define HAS_ARGBTORAWROW_AVX2 #define HAS_ARGBTORGB24ROW_AVX2 -#define HAS_ARGBTOAR64ROW_AVX2 -#define HAS_ARGBTOAB64ROW_AVX2 -#define HAS_AR64TOARGBROW_AVX2 -#define HAS_AB64TOARGBROW_AVX2 #define HAS_CONVERT16TO8ROW_AVX2 -#define HAS_INTERPOLATEROW_16TO8_AVX2 #define HAS_CONVERT8TO16ROW_AVX2 #define HAS_DIVIDEROW_16_AVX2 #define HAS_HALFMERGEUVROW_AVX2 -#define HAS_MERGEAR64ROW_AVX2 -#define HAS_MERGEARGB16TO8ROW_AVX2 -#define HAS_MERGEARGBROW_AVX2 -#define HAS_MERGEXR30ROW_AVX2 -#define HAS_MERGEXR64ROW_AVX2 -#define HAS_MERGEXRGB16TO8ROW_AVX2 -#define HAS_MERGEXRGBROW_AVX2 -#define HAS_NV21TOYUV24ROW_AVX2 #define HAS_I210TOAR30ROW_AVX2 #define HAS_I210TOARGBROW_AVX2 #define HAS_I212TOAR30ROW_AVX2 @@ -367,23 +361,35 @@ extern "C" { #define HAS_I400TOARGBROW_AVX2 #define HAS_I410TOAR30ROW_AVX2 #define HAS_I410TOARGBROW_AVX2 -#define HAS_P210TOAR30ROW_AVX2 -#define HAS_P210TOARGBROW_AVX2 -#define HAS_P410TOAR30ROW_AVX2 -#define HAS_P410TOARGBROW_AVX2 #define HAS_I422TOAR30ROW_AVX2 #define HAS_I422TOUYVYROW_AVX2 #define HAS_I422TOYUY2ROW_AVX2 +#define HAS_INTERPOLATEROW_16TO8_AVX2 +#define 
HAS_MERGEAR64ROW_AVX2 +#define HAS_MERGEARGB16TO8ROW_AVX2 +#define HAS_MERGEARGBROW_AVX2 #define HAS_MERGEUVROW_16_AVX2 +#define HAS_MERGEXR30ROW_AVX2 +#define HAS_MERGEXR64ROW_AVX2 +#define HAS_MERGEXRGB16TO8ROW_AVX2 +#define HAS_MERGEXRGBROW_AVX2 #define HAS_MIRRORUVROW_AVX2 #define HAS_MULTIPLYROW_16_AVX2 -#if !defined(LIBYUV_BIT_EXACT) +#define HAS_NV21TOYUV24ROW_AVX2 +#define HAS_P210TOAR30ROW_AVX2 +#define HAS_P210TOARGBROW_AVX2 +#define HAS_P410TOAR30ROW_AVX2 +#define HAS_P410TOARGBROW_AVX2 #define HAS_RGBATOYJROW_AVX2 -#endif #define HAS_SPLITARGBROW_AVX2 -#define HAS_SPLITXRGBROW_AVX2 #define HAS_SPLITUVROW_16_AVX2 +#define HAS_SPLITXRGBROW_AVX2 #define HAS_SWAPUVROW_AVX2 +#define HAS_YUY2TONVUVROW_AVX2 +#if !defined(LIBYUV_BIT_EXACT) +#define HAS_ABGRTOUVJROW_AVX2 +#define HAS_ABGRTOUVROW_AVX2 +#endif #if defined(__x86_64__) || !defined(__pic__) // TODO(fbarchard): fix build error on android_full_debug=1 @@ -397,8 +403,9 @@ extern "C" { // TODO(fbarchard): Port to GCC and Visual C // TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. 
Issue libyuv:789 #if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) && (defined(CLANG_HAS_AVX512)) + (defined(__x86_64__) || defined(__i386__)) && defined(CLANG_HAS_AVX512) #define HAS_ARGBTORGB24ROW_AVX512VBMI +#define HAS_MERGEUVROW_AVX512BW #endif // The following are available for AVX512 clang x64 platforms: @@ -412,7 +419,9 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)) #define HAS_AB64TOARGBROW_NEON +#define HAS_ABGRTOUVJROW_NEON #define HAS_ABGRTOUVROW_NEON +#define HAS_ABGRTOYJROW_NEON #define HAS_ABGRTOYROW_NEON #define HAS_AR64TOARGBROW_NEON #define HAS_ARGB1555TOARGBROW_NEON @@ -444,8 +453,11 @@ extern "C" { #define HAS_BYTETOFLOATROW_NEON #define HAS_CONVERT16TO8ROW_NEON #define HAS_COPYROW_NEON +#define HAS_DETILEROW_16_NEON #define HAS_DETILEROW_NEON #define HAS_DETILESPLITUVROW_NEON +#define HAS_DETILETOYUY2_NEON +#define HAS_UNPACKMT2T_NEON #define HAS_DIVIDEROW_16_NEON #define HAS_HALFFLOATROW_NEON #define HAS_HALFMERGEUVROW_NEON @@ -461,6 +473,7 @@ extern "C" { #define HAS_I422TOYUY2ROW_NEON #define HAS_I444ALPHATOARGBROW_NEON #define HAS_I444TOARGBROW_NEON +#define HAS_I444TORGB24ROW_NEON #define HAS_INTERPOLATEROW_16_NEON #define HAS_INTERPOLATEROW_NEON #define HAS_J400TOARGBROW_NEON @@ -513,6 +526,7 @@ extern "C" { #define HAS_UYVYTOUVROW_NEON #define HAS_UYVYTOYROW_NEON #define HAS_YUY2TOARGBROW_NEON +#define HAS_YUY2TONVUVROW_NEON #define HAS_YUY2TOUV422ROW_NEON #define HAS_YUY2TOUVROW_NEON #define HAS_YUY2TOYROW_NEON @@ -524,13 +538,13 @@ extern "C" { #define HAS_ARGBCOLORMATRIXROW_NEON #define HAS_ARGBGRAYROW_NEON #define HAS_ARGBMIRRORROW_NEON -#define HAS_RGB24MIRRORROW_NEON #define HAS_ARGBMULTIPLYROW_NEON #define HAS_ARGBQUANTIZEROW_NEON #define HAS_ARGBSEPIAROW_NEON #define HAS_ARGBSHADEROW_NEON #define HAS_ARGBSHUFFLEROW_NEON #define HAS_ARGBSUBTRACTROW_NEON +#define HAS_RGB24MIRRORROW_NEON #define HAS_SOBELROW_NEON #define 
HAS_SOBELTOPLANEROW_NEON #define HAS_SOBELXROW_NEON @@ -540,12 +554,13 @@ extern "C" { // The following are available on AArch64 platforms: #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +#define HAS_GAUSSCOL_F32_NEON +#define HAS_GAUSSROW_F32_NEON #define HAS_INTERPOLATEROW_16TO8_NEON #define HAS_SCALESUMSAMPLES_NEON -#define HAS_GAUSSROW_F32_NEON -#define HAS_GAUSSCOL_F32_NEON #endif #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#define HAS_ABGRTOUVJROW_MSA #define HAS_ABGRTOUVROW_MSA #define HAS_ABGRTOYROW_MSA #define HAS_ARGB1555TOARGBROW_MSA @@ -581,27 +596,25 @@ extern "C" { #define HAS_BGRATOYROW_MSA #define HAS_HALFFLOATROW_MSA #define HAS_I400TOARGBROW_MSA -#define HAS_I422TOUYVYROW_MSA -#define HAS_I422TOYUY2ROW_MSA -#define HAS_I422TOARGBROW_MSA -#define HAS_I422TORGBAROW_MSA #define HAS_I422ALPHATOARGBROW_MSA +#define HAS_I422TOARGB1555ROW_MSA +#define HAS_I422TOARGB4444ROW_MSA +#define HAS_I422TOARGBROW_MSA #define HAS_I422TORGB24ROW_MSA #define HAS_I422TORGB565ROW_MSA -#define HAS_I422TOARGB4444ROW_MSA -#define HAS_I422TOARGB1555ROW_MSA -#define HAS_NV12TOARGBROW_MSA -#define HAS_NV12TORGB565ROW_MSA -#define HAS_NV21TOARGBROW_MSA -#define HAS_YUY2TOARGBROW_MSA -#define HAS_UYVYTOARGBROW_MSA +#define HAS_I422TORGBAROW_MSA +#define HAS_I422TOUYVYROW_MSA +#define HAS_I422TOYUY2ROW_MSA #define HAS_I444TOARGBROW_MSA #define HAS_INTERPOLATEROW_MSA #define HAS_J400TOARGBROW_MSA #define HAS_MERGEUVROW_MSA #define HAS_MIRRORROW_MSA -#define HAS_MIRRORUVROW_MSA #define HAS_MIRRORSPLITUVROW_MSA +#define HAS_MIRRORUVROW_MSA +#define HAS_NV12TOARGBROW_MSA +#define HAS_NV12TORGB565ROW_MSA +#define HAS_NV21TOARGBROW_MSA #define HAS_RAWTOARGBROW_MSA #define HAS_RAWTORGB24ROW_MSA #define HAS_RAWTOUVROW_MSA @@ -621,113 +634,208 @@ extern "C" { #define HAS_SOBELXYROW_MSA #define HAS_SOBELYROW_MSA #define HAS_SPLITUVROW_MSA +#define HAS_UYVYTOARGBROW_MSA #define HAS_UYVYTOUVROW_MSA #define HAS_UYVYTOYROW_MSA +#define HAS_YUY2TOARGBROW_MSA #define 
HAS_YUY2TOUV422ROW_MSA #define HAS_YUY2TOUVROW_MSA #define HAS_YUY2TOYROW_MSA #endif #if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) -#define HAS_ARGB4444TOARGBROW_LSX +#define HAS_ABGRTOUVROW_LSX +#define HAS_ABGRTOYROW_LSX #define HAS_ARGB1555TOARGBROW_LSX -#define HAS_RGB565TOARGBROW_LSX -#define HAS_RGB24TOARGBROW_LSX -#define HAS_RAWTOARGBROW_LSX -#define HAS_ARGB1555TOYROW_LSX #define HAS_ARGB1555TOUVROW_LSX -#define HAS_RGB565TOYROW_LSX -#define HAS_RGB565TOUVROW_LSX -#define HAS_RGB24TOYROW_LSX -#define HAS_RGB24TOUVROW_LSX -#define HAS_RAWTOYROW_LSX -#define HAS_RAWTOUVROW_LSX +#define HAS_ARGB1555TOYROW_LSX +#define HAS_ARGB4444TOARGBROW_LSX +#define HAS_ARGBADDROW_LSX +#define HAS_ARGBATTENUATEROW_LSX +#define HAS_ARGBBLENDROW_LSX +#define HAS_ARGBCOLORMATRIXROW_LSX +#define HAS_ARGBEXTRACTALPHAROW_LSX +#define HAS_ARGBGRAYROW_LSX +#define HAS_ARGBSEPIAROW_LSX +#define HAS_ARGBSHADEROW_LSX +#define HAS_ARGBSHUFFLEROW_LSX +#define HAS_ARGBSUBTRACTROW_LSX +#define HAS_ARGBQUANTIZEROW_LSX +#define HAS_ARGBSETROW_LSX +#define HAS_ARGBTOARGB1555ROW_LSX +#define HAS_ARGBTOARGB4444ROW_LSX +#define HAS_ARGBTORAWROW_LSX +#define HAS_ARGBTORGB24ROW_LSX +#define HAS_ARGBTORGB565ROW_LSX +#define HAS_ARGBTORGB565DITHERROW_LSX +#define HAS_ARGBTOUVJROW_LSX +#define HAS_ARGBTOUV444ROW_LSX +#define HAS_ARGBTOUVROW_LSX +#define HAS_ARGBTOYJROW_LSX +#define HAS_ARGBMIRRORROW_LSX +#define HAS_ARGBMULTIPLYROW_LSX +#define HAS_BGRATOUVROW_LSX +#define HAS_BGRATOYROW_LSX +#define HAS_I400TOARGBROW_LSX +#define HAS_I444TOARGBROW_LSX +#define HAS_INTERPOLATEROW_LSX +#define HAS_I422ALPHATOARGBROW_LSX +#define HAS_I422TOARGB1555ROW_LSX +#define HAS_I422TOARGB4444ROW_LSX +#define HAS_I422TORGB24ROW_LSX +#define HAS_I422TORGB565ROW_LSX +#define HAS_I422TORGBAROW_LSX +#define HAS_I422TOUYVYROW_LSX +#define HAS_I422TOYUY2ROW_LSX +#define HAS_J400TOARGBROW_LSX +#define HAS_MERGEUVROW_LSX +#define HAS_MIRRORROW_LSX +#define HAS_MIRRORUVROW_LSX +#define 
HAS_MIRRORSPLITUVROW_LSX #define HAS_NV12TOARGBROW_LSX #define HAS_NV12TORGB565ROW_LSX #define HAS_NV21TOARGBROW_LSX +#define HAS_RAWTOARGBROW_LSX +#define HAS_RAWTORGB24ROW_LSX +#define HAS_RAWTOUVROW_LSX +#define HAS_RAWTOYROW_LSX +#define HAS_RGB24TOARGBROW_LSX +#define HAS_RGB24TOUVROW_LSX +#define HAS_RGB24TOYROW_LSX +#define HAS_RGB565TOARGBROW_LSX +#define HAS_RGB565TOUVROW_LSX +#define HAS_RGB565TOYROW_LSX +#define HAS_RGBATOUVROW_LSX +#define HAS_RGBATOYROW_LSX +#define HAS_SETROW_LSX #define HAS_SOBELROW_LSX #define HAS_SOBELTOPLANEROW_LSX #define HAS_SOBELXYROW_LSX -#define HAS_ARGBTOYJROW_LSX -#define HAS_BGRATOYROW_LSX -#define HAS_BGRATOUVROW_LSX -#define HAS_ABGRTOYROW_LSX -#define HAS_ABGRTOUVROW_LSX -#define HAS_RGBATOYROW_LSX -#define HAS_RGBATOUVROW_LSX -#define HAS_ARGBTOUVJROW_LSX -#define HAS_I444TOARGBROW_LSX -#define HAS_I400TOARGBROW_LSX -#define HAS_J400TOARGBROW_LSX -#define HAS_YUY2TOARGBROW_LSX -#define HAS_UYVYTOARGBROW_LSX -#define HAS_INTERPOLATEROW_LSX -#define HAS_ARGBSETROW_LSX -#define HAS_RAWTORGB24ROW_LSX -#define HAS_MERGEUVROW_LSX -#define HAS_ARGBEXTRACTALPHAROW_LSX -#define HAS_ARGBBLENDROW_LSX -#define HAS_ARGBQUANTIZEROW_LSX -#define HAS_ARGBCOLORMATRIXROW_LSX #define HAS_SPLITUVROW_LSX -#define HAS_SETROW_LSX -#define HAS_MIRRORSPLITUVROW_LSX +#define HAS_UYVYTOARGBROW_LSX +#define HAS_UYVYTOUV422ROW_LSX +#define HAS_UYVYTOUVROW_LSX +#define HAS_UYVYTOYROW_LSX +#define HAS_YUY2TOARGBROW_LSX +#define HAS_YUY2TOUVROW_LSX +#define HAS_YUY2TOUV422ROW_LSX +#define HAS_YUY2TOYROW_LSX +#define HAS_ARGBTOYROW_LSX +#define HAS_ABGRTOYJROW_LSX +#define HAS_RGBATOYJROW_LSX +#define HAS_RGB24TOYJROW_LSX +#define HAS_RAWTOYJROW_LSX +#endif + +#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx) +#define HAS_I422TOARGBROW_LSX #endif #if !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx) +#define HAS_ARGB1555TOARGBROW_LASX +#define HAS_ARGB1555TOUVROW_LASX +#define HAS_ARGB1555TOYROW_LASX +#define 
HAS_ARGB4444TOARGBROW_LASX +#define HAS_ARGBADDROW_LASX +#define HAS_ARGBATTENUATEROW_LASX +#define HAS_ARGBGRAYROW_LASX +#define HAS_ARGBMIRRORROW_LASX +#define HAS_ARGBMULTIPLYROW_LASX +#define HAS_ARGBSEPIAROW_LASX +#define HAS_ARGBSHADEROW_LASX +#define HAS_ARGBSHUFFLEROW_LASX +#define HAS_ARGBSUBTRACTROW_LASX +#define HAS_ARGBTOARGB1555ROW_LASX +#define HAS_ARGBTOARGB4444ROW_LASX +#define HAS_ARGBTORAWROW_LASX +#define HAS_ARGBTORGB24ROW_LASX +#define HAS_ARGBTORGB565DITHERROW_LASX +#define HAS_ARGBTORGB565ROW_LASX +#define HAS_ARGBTOUV444ROW_LASX +#define HAS_ARGBTOUVJROW_LASX +#define HAS_ARGBTOUVROW_LASX +#define HAS_ARGBTOYJROW_LASX +#define HAS_ARGBTOYROW_LASX +#define HAS_ABGRTOYJROW_LASX +#define HAS_ABGRTOYROW_LASX +#define HAS_I422ALPHATOARGBROW_LASX +#define HAS_I422TOARGB1555ROW_LASX +#define HAS_I422TOARGB4444ROW_LASX #define HAS_I422TOARGBROW_LASX +#define HAS_I422TORGB24ROW_LASX +#define HAS_I422TORGB565ROW_LASX #define HAS_I422TORGBAROW_LASX -#define HAS_I422ALPHATOARGBROW_LASX -#define HAS_I422TOYUY2ROW_LASX #define HAS_I422TOUYVYROW_LASX +#define HAS_I422TOYUY2ROW_LASX #define HAS_MIRRORROW_LASX #define HAS_MIRRORUVROW_LASX -#define HAS_ARGBMIRRORROW_LASX -#define HAS_I422TORGB24ROW_LASX -#define HAS_I422TORGB565ROW_LASX -#define HAS_I422TOARGB4444ROW_LASX -#define HAS_I422TOARGB1555ROW_LASX -#define HAS_YUY2TOUVROW_LASX -#define HAS_YUY2TOYROW_LASX -#define HAS_YUY2TOUV422ROW_LASX -#define HAS_UYVYTOYROW_LASX -#define HAS_UYVYTOUVROW_LASX -#define HAS_UYVYTOUV422ROW_LASX -#define HAS_ARGBTOYROW_LASX -#define HAS_ARGBTOUVROW_LASX -#define HAS_ARGBTORGB24ROW_LASX -#define HAS_ARGBTORAWROW_LASX -#define HAS_ARGBTORGB565ROW_LASX -#define HAS_ARGBTOARGB1555ROW_LASX -#define HAS_ARGBTOARGB4444ROW_LASX -#define HAS_ARGBTOUV444ROW_LASX -#define HAS_ARGBMULTIPLYROW_LASX -#define HAS_ARGBADDROW_LASX -#define HAS_ARGBSUBTRACTROW_LASX -#define HAS_ARGBATTENUATEROW_LASX -#define HAS_ARGBTORGB565DITHERROW_LASX -#define HAS_ARGBSHUFFLEROW_LASX -#define 
HAS_ARGBSHADEROW_LASX -#define HAS_ARGBGRAYROW_LASX -#define HAS_ARGBSEPIAROW_LASX -#define HAS_ARGB4444TOARGBROW_LASX -#define HAS_ARGB1555TOARGBROW_LASX -#define HAS_RGB565TOARGBROW_LASX -#define HAS_RGB24TOARGBROW_LASX -#define HAS_RAWTOARGBROW_LASX -#define HAS_ARGB1555TOYROW_LASX -#define HAS_ARGB1555TOUVROW_LASX -#define HAS_RGB565TOYROW_LASX -#define HAS_RGB565TOUVROW_LASX -#define HAS_RGB24TOYROW_LASX -#define HAS_RGB24TOUVROW_LASX -#define HAS_RAWTOYROW_LASX -#define HAS_RAWTOUVROW_LASX #define HAS_NV12TOARGBROW_LASX #define HAS_NV12TORGB565ROW_LASX #define HAS_NV21TOARGBROW_LASX -#define HAS_ARGBTOYJROW_LASX -#define HAS_ARGBTOUVJROW_LASX +#define HAS_RAWTOARGBROW_LASX +#define HAS_RAWTOUVROW_LASX +#define HAS_RAWTOYROW_LASX +#define HAS_RGB24TOARGBROW_LASX +#define HAS_RGB24TOUVROW_LASX +#define HAS_RGB24TOYROW_LASX +#define HAS_RGB565TOARGBROW_LASX +#define HAS_RGB565TOUVROW_LASX +#define HAS_RGB565TOYROW_LASX +#define HAS_UYVYTOUV422ROW_LASX +#define HAS_UYVYTOUVROW_LASX +#define HAS_UYVYTOYROW_LASX +#define HAS_YUY2TOUV422ROW_LASX +#define HAS_YUY2TOUVROW_LASX +#define HAS_YUY2TOYROW_LASX +#define HAS_RGBATOYROW_LASX +#define HAS_RGBATOYJROW_LASX +#define HAS_BGRATOYROW_LASX +#define HAS_RGB24TOYJROW_LASX +#define HAS_RAWTOYJROW_LASX +#endif + +#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) +#define HAS_AB64TOARGBROW_RVV +#define HAS_AR64TOARGBROW_RVV +#define HAS_ARGBATTENUATEROW_RVV +#define HAS_ARGBTOAB64ROW_RVV +#define HAS_ARGBTOAR64ROW_RVV +#define HAS_ARGBTORAWROW_RVV +#define HAS_ARGBTORGB24ROW_RVV +#define HAS_ARGBTOYROW_RVV +#define HAS_ARGBTOYJROW_RVV +#define HAS_ABGRTOYROW_RVV +#define HAS_ABGRTOYJROW_RVV +#define HAS_BGRATOYROW_RVV +#define HAS_COPYROW_RVV +#define HAS_I400TOARGBROW_RVV +#define HAS_I422ALPHATOARGBROW_RVV +#define HAS_I422TOARGBROW_RVV +#define HAS_I422TORGB24ROW_RVV +#define HAS_I422TORGBAROW_RVV +#define HAS_I444ALPHATOARGBROW_RVV +#define HAS_I444TOARGBROW_RVV +#define HAS_I444TORGB24ROW_RVV +#define 
HAS_INTERPOLATEROW_RVV +#define HAS_J400TOARGBROW_RVV +#define HAS_MERGEARGBROW_RVV +#define HAS_MERGERGBROW_RVV +#define HAS_MERGEUVROW_RVV +#define HAS_MERGEXRGBROW_RVV +#define HAS_SPLITARGBROW_RVV +#define HAS_SPLITRGBROW_RVV +#define HAS_SPLITUVROW_RVV +#define HAS_SPLITXRGBROW_RVV +#define HAS_RAWTOARGBROW_RVV +#define HAS_RAWTORGB24ROW_RVV +#define HAS_RAWTORGBAROW_RVV +#define HAS_RAWTOYJROW_RVV +#define HAS_RAWTOYROW_RVV +#define HAS_RGB24TOARGBROW_RVV +#define HAS_RGB24TOYJROW_RVV +#define HAS_RGB24TOYROW_RVV +#define HAS_RGBATOYROW_RVV +#define HAS_RGBATOYJROW_RVV #endif #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) @@ -789,8 +897,8 @@ typedef uint32_t ulvec32[8]; typedef uint8_t ulvec8[32]; #endif -#if defined(__aarch64__) || defined(__arm__) -// This struct is for ARM color conversion. +#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) +// This struct is for ARM and RISC-V color conversion. struct YuvConstants { uvec8 kUVCoeff; vec16 kRGBCoeffBias; @@ -816,13 +924,21 @@ struct YuvConstants { #define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1))) -#define align_buffer_64(var, size) \ - uint8_t* var##_mem = (uint8_t*)(malloc((size) + 63)); /* NOLINT */ \ - uint8_t* var = (uint8_t*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */ +#define align_buffer_64(var, size) \ + void* var##_mem = malloc((size) + 63); /* NOLINT */ \ + uint8_t* var = (uint8_t*)(((intptr_t)var##_mem + 63) & ~63) /* NOLINT */ #define free_aligned_buffer_64(var) \ free(var##_mem); \ - var = 0 + var = NULL + +#define align_buffer_64_16(var, size) \ + void* var##_mem = malloc((size)*2 + 63); /* NOLINT */ \ + uint16_t* var = (uint16_t*)(((intptr_t)var##_mem + 63) & ~63) /* NOLINT */ + +#define free_aligned_buffer_64_16(var) \ + free(var##_mem); \ + var = NULL #if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__) #define OMITFP @@ -894,6 +1010,12 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* 
yuvconstants, int width); +void I444ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -981,6 +1103,50 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I444ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I444AlphaToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I444ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGBARow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); void I444ToARGBRow_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1000,6 +1166,12 @@ void I422ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422ToARGBRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, 
+ const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1012,6 +1184,12 @@ void I422ToRGBARow_MSA(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422ToRGBARow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGBARow_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1025,6 +1203,13 @@ void I422AlphaToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422AlphaToARGBRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422AlphaToARGBRow_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1038,6 +1223,12 @@ void I422ToRGB24Row_MSA(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422ToRGB24Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGB24Row_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1050,6 +1241,12 @@ void I422ToRGB565Row_MSA(const uint8_t* src_y, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); +void I422ToRGB565Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGB565Row_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1062,6 +1259,12 @@ void I422ToARGB4444Row_MSA(const uint8_t* src_y, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width); 
+void I422ToARGB4444Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGB4444Row_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1074,6 +1277,12 @@ void I422ToARGB1555Row_MSA(const uint8_t* src_y, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width); +void I422ToARGB1555Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGB1555Row_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1148,9 +1357,13 @@ void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ABGRToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void ABGRToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGBAToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); @@ -1164,13 +1377,23 @@ void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width); void 
RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width); void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); -void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width); +void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width); +void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width); +void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYJRow_RVV(const uint8_t* src_argb, uint8_t* dst_yj, int width); +void ABGRToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width); +void RGBAToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width); void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ARGBToYRow_LSX(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYJRow_LSX(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ABGRToYJRow_LSX(const uint8_t* src_abgr, uint8_t* dst_yj, int width); +void RGBAToYJRow_LSX(const uint8_t* src_rgba, uint8_t* dst_yj, int width); void ARGBToYJRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ABGRToYJRow_LASX(const uint8_t* src_abgr, uint8_t* dst_yj, int width); +void RGBAToYJRow_LASX(const uint8_t* src_rgba, uint8_t* dst_yj, int width); void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1189,11 +1412,20 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUVRow_LSX(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void 
ARGBToUVRow_LASX(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUV444Row_LSX(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUV444Row_LASX(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1203,6 +1435,11 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_uj, + uint8_t* dst_vj, + int width); void BGRAToUVRow_NEON(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, @@ -1258,6 +1495,11 @@ void ARGBToUVJRow_MSA(const uint8_t* src_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_MSA(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void BGRAToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, @@ -1372,6 +1614,13 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width); +void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width); +void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); +void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width); void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); @@ -1384,6 +1633,8 @@ void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width); void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width); void 
RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width); void ARGB1555ToYRow_LSX(const uint8_t* src_argb1555, uint8_t* dst_y, int width); +void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); +void ABGRToYRow_LASX(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ARGB1555ToYRow_LASX(const uint8_t* src_argb1555, uint8_t* dst_y, int width); @@ -1393,9 +1644,15 @@ void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width); void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width); void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width); void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RGBAToYRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width); +void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); +void RAWToYJRow_LSX(const uint8_t* src_raw, uint8_t* dst_yj, int width); +void RAWToYJRow_LASX(const uint8_t* src_raw, uint8_t* dst_yj, int width); void ARGBToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void ARGBToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void ABGRToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void RGBAToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void BGRAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void ABGRToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); @@ -1409,6 +1666,7 @@ void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width); void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int 
width); void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1423,6 +1681,7 @@ void RGB24ToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1453,10 +1712,15 @@ void ARGB1555ToYRow_Any_MSA(const uint8_t* src_ptr, void BGRAToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB565ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB1555ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1465,7 +1729,14 @@ void RGB565ToYRow_Any_LASX(const uint8_t* 
src_ptr, uint8_t* dst_ptr, int width); void RGB24ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void BGRAToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB1555ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1485,6 +1756,11 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_AVX2(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -1495,6 +1771,11 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, @@ -1525,6 +1806,11 @@ void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_Any_AVX2(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, @@ -1535,6 +1821,11 @@ void 
ARGBToUVJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_Any_SSSE3(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void BGRAToUVRow_Any_SSSE3(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, @@ -1568,11 +1859,20 @@ void ARGBToUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUVRow_Any_LSX(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVRow_Any_LASX(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUV444Row_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUV444Row_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -1582,6 +1882,11 @@ void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_Any_NEON(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, @@ -1747,16 +2052,16 @@ void ARGBToUVJRow_C(const uint8_t* src_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_C(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVJRow_C(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void BGRAToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, @@ -1772,6 +2077,11 @@ void RGBAToUVRow_C(const uint8_t* src_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); +void RGBAToUVJRow_C(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void RGB24ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, @@ 
-1826,6 +2136,7 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1833,17 +2144,20 @@ void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width); +void MirrorUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorUVRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_LASX(const uint8_t* 
src_ptr, uint8_t* dst_ptr, int width); void MirrorSplitUVRow_SSSE3(const uint8_t* src, @@ -1867,10 +2181,13 @@ void MirrorSplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_v, int width); +void MirrorRow_16_C(const uint16_t* src, uint16_t* dst, int width); + void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr, @@ -1883,6 +2200,7 @@ void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBMirrorRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBMirrorRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1925,6 +2243,10 @@ void SplitUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); +void SplitUVRow_RVV(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void SplitUVRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -1949,7 +2271,6 @@ void DetileRow_C(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width); - void DetileRow_NEON(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, @@ -1966,6 +2287,42 @@ void DetileRow_Any_SSE2(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width); +void DetileRow_AVX(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width); +void DetileRow_Any_AVX(const uint8_t* src, + ptrdiff_t src_tile_stride, + uint8_t* dst, + int width); +void DetileRow_16_C(const uint16_t* src, + 
ptrdiff_t src_tile_stride, + uint16_t* dst, + int width); +void DetileRow_16_NEON(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width); +void DetileRow_16_Any_NEON(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width); +void DetileRow_16_SSE2(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width); +void DetileRow_16_Any_SSE2(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width); +void DetileRow_16_AVX(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width); +void DetileRow_16_Any_AVX(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width); void DetileSplitUVRow_C(const uint8_t* src_uv, ptrdiff_t src_tile_stride, uint8_t* dst_u, @@ -1991,6 +2348,38 @@ void DetileSplitUVRow_Any_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); +void DetileToYUY2_C(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width); +void DetileToYUY2_SSE2(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width); +void DetileToYUY2_Any_SSE2(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width); +void DetileToYUY2_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width); +void DetileToYUY2_Any_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width); +void UnpackMT2T_C(const uint8_t* src, uint16_t* dst, size_t size); +void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size); void MergeUVRow_C(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, @@ -2003,6 +2392,10 @@ 
void MergeUVRow_AVX2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width); +void MergeUVRow_AVX512BW(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, @@ -2015,6 +2408,10 @@ void MergeUVRow_LSX(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width); +void MergeUVRow_RVV(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); void MergeUVRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -2023,6 +2420,10 @@ void MergeUVRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); +void MergeUVRow_Any_AVX512BW(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); void MergeUVRow_Any_NEON(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -2079,6 +2480,11 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width); +void SplitRGBRow_RVV(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, @@ -2105,6 +2511,11 @@ void MergeRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width); +void MergeRGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width); void MergeRGBRow_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2139,6 +2550,12 @@ void MergeARGBRow_NEON(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width); +void MergeARGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int width); void MergeARGBRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2187,6 +2604,12 @@ void SplitARGBRow_NEON(const uint8_t* src_rgba, 
uint8_t* dst_b, uint8_t* dst_a, int width); +void SplitARGBRow_RVV(const uint8_t* src_rgba, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width); void SplitARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, @@ -2231,6 +2654,11 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width); +void MergeXRGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_argb, + int width); void MergeXRGBRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2271,6 +2699,11 @@ void SplitXRGBRow_NEON(const uint8_t* src_rgba, uint8_t* dst_g, uint8_t* dst_b, int width); +void SplitXRGBRow_RVV(const uint8_t* src_rgba, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); void SplitXRGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, @@ -2604,8 +3037,8 @@ void Convert16To8Row_NEON(const uint16_t* src_y, uint8_t* dst_y, int scale, int width); -void Convert16To8Row_Any_NEON(const uint16_t* src_y, - uint8_t* dst_y, +void Convert16To8Row_Any_NEON(const uint16_t* src_ptr, + uint8_t* dst_ptr, int scale, int width); @@ -2614,6 +3047,7 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width); void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width); void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width); void CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count); +void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int count); void CopyRow_C(const uint8_t* src, uint8_t* dst, int count); void CopyRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -2713,6 +3147,10 @@ void ARGBShuffleRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width); +void ARGBShuffleRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); void 
ARGBShuffleRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, @@ -2733,6 +3171,10 @@ void ARGBShuffleRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint8_t* param, int width); +void ARGBShuffleRow_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint8_t* param, + int width); void ARGBShuffleRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint8_t* param, @@ -2765,14 +3207,18 @@ void RGB24ToARGBRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); +void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width); void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToARGBRow_LSX(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width); void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); void RAWToRGB24Row_LSX(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); +void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_argb, int width); @@ -2932,15 +3378,15 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width); void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width); void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, uint8_t* dst, - const uint32_t dither4, + uint32_t dither4, int width); void 
ARGBToRGB565DitherRow_AVX2(const uint8_t* src, uint8_t* dst, - const uint32_t dither4, + uint32_t dither4, int width); void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); @@ -2968,7 +3414,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, int width); void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width); void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); @@ -2981,23 +3427,39 @@ void ARGBToARGB4444Row_MSA(const uint8_t* src_argb, int width); void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, + int width); +void ARGBToRGB565DitherRow_LSX(const uint8_t* src_argb, + uint8_t* dst_rgb, + uint32_t dither4, int width); void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width); +void ARGBToRGB24Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRGB24Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRAWRow_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRAWRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRGB565Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRGB565Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB1555Row_LSX(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); void ARGBToARGB1555Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB4444Row_LSX(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); void ARGBToARGB4444Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width); +void ARGBToRGB24Row_RVV(const uint8_t* src_argb, uint8_t* 
dst_rgb24, int width); + void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); @@ -3035,6 +3497,10 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width); void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width); void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width); void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, int width); +void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width); +void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width); +void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width); +void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width); void ARGBToAR64Row_Any_SSSE3(const uint8_t* src_ptr, uint16_t* dst_ptr, int width); @@ -3077,6 +3543,7 @@ void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -3096,6 +3563,12 @@ void I444ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); +void I444ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -3290,6 +3763,18 @@ void 
I444ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I444ToRGB24Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void I444ToRGB24Row_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -3631,12 +4116,24 @@ void I444ToARGBRow_Any_SSSE3(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I444ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I444ToARGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I444ToRGB24Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -3823,13 +4320,13 @@ void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV21ToYUV24Row_Any_SSSE3(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, +void NV21ToYUV24Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, int width); -void NV21ToYUV24Row_Any_AVX2(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, +void NV21ToYUV24Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, int width); void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf, const uint8_t* uv_buf, @@ -3976,6 +4473,10 @@ void 
I400ToARGBRow_LSX(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I400ToARGBRow_RVV(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* param, @@ -4084,10 +4585,18 @@ void ARGBMultiplyRow_Any_MSA(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); +void ARGBMultiplyRow_LSX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); void ARGBMultiplyRow_LASX(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width); +void ARGBMultiplyRow_Any_LSX(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); void ARGBMultiplyRow_Any_LASX(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -4130,10 +4639,18 @@ void ARGBAddRow_Any_MSA(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); +void ARGBAddRow_LSX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); void ARGBAddRow_LASX(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width); +void ARGBAddRow_Any_LSX(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); void ARGBAddRow_Any_LASX(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -4177,10 +4694,18 @@ void ARGBSubtractRow_Any_MSA(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); +void ARGBSubtractRow_LSX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); void ARGBSubtractRow_LASX(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width); +void ARGBSubtractRow_Any_LSX(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); void ARGBSubtractRow_Any_LASX(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -4273,21 
+4798,37 @@ void ARGBToRGB565DitherRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint32_t param, int width); +void ARGBToRGB565DitherRow_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint32_t param, + int width); void ARGBToRGB565DitherRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint32_t param, int width); - +void ARGBToRGB24Row_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGBToRGB24Row_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToRAWRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToRAWRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToRGB565Row_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGBToRGB565Row_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToARGB1555Row_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGBToARGB1555Row_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToARGB4444Row_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGBToARGB4444Row_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -4298,6 +4839,12 @@ void I444ToARGBRow_Any_NEON(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I444ToRGB24Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_Any_NEON(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4443,6 +4990,12 @@ void I422ToARGBRow_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToARGBRow_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_Any_LASX(const uint8_t* y_buf, const 
uint8_t* u_buf, const uint8_t* v_buf, @@ -4455,6 +5008,12 @@ void I422ToRGBARow_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToRGBARow_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGBARow_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4468,6 +5027,13 @@ void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422AlphaToARGBRow_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422AlphaToARGBRow_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4481,6 +5047,12 @@ void I422ToRGB24Row_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToRGB24Row_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGB24Row_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4493,6 +5065,12 @@ void I422ToRGB565Row_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToRGB565Row_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGB565Row_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4505,6 +5083,12 @@ void I422ToARGB4444Row_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToARGB4444Row_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* 
dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGB4444Row_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4517,6 +5101,12 @@ void I422ToARGB1555Row_Any_MSA(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I422ToARGB1555Row_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToARGB1555Row_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4592,6 +5182,10 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, @@ -4602,6 +5196,10 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, @@ -4612,17 +5210,27 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width); +void YUY2ToYRow_LSX(const uint8_t* src_yuy2, uint8_t* dst_y, int width); void YUY2ToYRow_LASX(const uint8_t* src_yuy2, uint8_t* dst_y, int width); void YUY2ToUVRow_MSA(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToUVRow_LSX(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void YUY2ToUVRow_LASX(const uint8_t* src_yuy2, int 
src_stride_yuy2, uint8_t* dst_u, @@ -4632,6 +5240,10 @@ void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToUV422Row_LSX(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void YUY2ToUV422Row_LASX(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, @@ -4642,6 +5254,10 @@ void YUY2ToUVRow_C(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_C(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_C(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, @@ -4652,6 +5268,10 @@ void YUY2ToUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_Any_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -4662,6 +5282,10 @@ void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_Any_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -4672,17 +5296,27 @@ void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_Any_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void YUY2ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToUVRow_Any_MSA(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToUVRow_Any_LSX(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, 
+ uint8_t* dst_v, + int width); void YUY2ToUVRow_Any_LASX(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, @@ -4692,6 +5326,10 @@ void YUY2ToUV422Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToUV422Row_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void YUY2ToUV422Row_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -4737,12 +5375,18 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, uint8_t* dst_v, int width); void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width); +void UYVYToYRow_LSX(const uint8_t* src_uyvy, uint8_t* dst_y, int width); void UYVYToYRow_LASX(const uint8_t* src_uyvy, uint8_t* dst_y, int width); void UYVYToUVRow_MSA(const uint8_t* src_uyvy, int src_stride_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width); +void UYVYToUVRow_LSX(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void UYVYToUVRow_LASX(const uint8_t* src_uyvy, int src_stride_uyvy, uint8_t* dst_u, @@ -4752,6 +5396,10 @@ void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width); +void UYVYToUV422Row_LSX(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void UYVYToUV422Row_LASX(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, @@ -4798,12 +5446,18 @@ void UYVYToUV422Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_v, int width); void UYVYToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void UYVYToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void UYVYToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void UYVYToUVRow_Any_MSA(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void UYVYToUVRow_Any_LSX(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void UYVYToUVRow_Any_LASX(const uint8_t* src_ptr, int 
src_stride_ptr, uint8_t* dst_u, @@ -4813,6 +5467,10 @@ void UYVYToUV422Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void UYVYToUV422Row_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void UYVYToUV422Row_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -4927,6 +5585,11 @@ void I422ToYUY2Row_MSA(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width); +void I422ToYUY2Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width); void I422ToYUY2Row_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -4937,6 +5600,11 @@ void I422ToUYVYRow_MSA(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width); +void I422ToUYVYRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width); void I422ToUYVYRow_LASX(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -4947,6 +5615,11 @@ void I422ToYUY2Row_Any_MSA(const uint8_t* y_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width); +void I422ToYUY2Row_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); void I422ToYUY2Row_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4957,6 +5630,11 @@ void I422ToUYVYRow_Any_MSA(const uint8_t* y_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width); +void I422ToUYVYRow_Any_LSX(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); void I422ToUYVYRow_Any_LASX(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -4977,9 +5655,15 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, void ARGBAttenuateRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBAttenuateRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); void ARGBAttenuateRow_LASX(const 
uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBAttenuateRow_RVV(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -4992,6 +5676,9 @@ void ARGBAttenuateRow_Any_NEON(const uint8_t* src_ptr, void ARGBAttenuateRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBAttenuateRow_Any_LSX(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGBAttenuateRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -5018,12 +5705,14 @@ void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBGrayRow_LSX(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBSepiaRow_C(uint8_t* dst_argb, int width); void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width); void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width); void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width); +void ARGBSepiaRow_LSX(uint8_t* dst_argb, int width); void ARGBSepiaRow_LASX(uint8_t* dst_argb, int width); void ARGBColorMatrixRow_C(const uint8_t* src_argb, @@ -5103,6 +5792,10 @@ void ARGBShadeRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value); +void ARGBShadeRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value); void ARGBShadeRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width, @@ -5175,6 +5868,11 @@ void InterpolateRow_LSX(uint8_t* dst_ptr, ptrdiff_t src_stride, int width, int source_y_fraction); +void InterpolateRow_RVV(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction); void 
InterpolateRow_Any_NEON(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride_ptr, @@ -5526,6 +6224,17 @@ void GaussCol_F32_C(const float* src0, float* dst, int width); +void GaussRow_C(const uint32_t* src, uint16_t* dst, int width); +void GaussCol_C(const uint16_t* src0, + const uint16_t* src1, + const uint16_t* src2, + const uint16_t* src3, + const uint16_t* src4, + uint32_t* dst, + int width); + +void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width); + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/include/libyuv/scale_row.h b/files/include/libyuv/scale_row.h index 6cb5e128..a7957c3f 100644 --- a/files/include/libyuv/scale_row.h +++ b/files/include/libyuv/scale_row.h @@ -133,6 +133,8 @@ extern "C" { #define HAS_SCALEROWDOWN34_NEON #define HAS_SCALEROWDOWN38_NEON #define HAS_SCALEROWDOWN4_NEON +#define HAS_SCALEUVROWDOWN2_NEON +#define HAS_SCALEUVROWDOWN2LINEAR_NEON #define HAS_SCALEUVROWDOWN2BOX_NEON #define HAS_SCALEUVROWDOWNEVEN_NEON #define HAS_SCALEROWUP2_LINEAR_NEON @@ -214,6 +216,17 @@ void ScalePlaneVertical_16To8(int src_height, int scale, enum FilterMode filtering); +void ScalePlaneDown2_16To8(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint8_t* dst_ptr, + int scale, + enum FilterMode filtering); + // Simplify the filtering based on scale factors. 
enum FilterMode ScaleFilterReduce(int src_width, int src_height, @@ -259,6 +272,16 @@ void ScaleRowDown2_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); +void ScaleRowDown2_16To8_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale); +void ScaleRowDown2_16To8_Odd_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale); void ScaleRowDown2Linear_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ -267,6 +290,16 @@ void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); +void ScaleRowDown2Linear_16To8_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale); +void ScaleRowDown2Linear_16To8_Odd_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale); void ScaleRowDown2Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ -279,6 +312,16 @@ void ScaleRowDown2Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); +void ScaleRowDown2Box_16To8_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale); +void ScaleRowDown2Box_16To8_Odd_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale); void ScaleRowDown4_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, diff --git a/files/include/libyuv/version.h b/files/include/libyuv/version.h index a85be048..b6623dbb 100644 --- a/files/include/libyuv/version.h +++ b/files/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1837 +#define LIBYUV_VERSION 1871 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/files/infra/config/PRESUBMIT.py b/files/infra/config/PRESUBMIT.py index 01ec0eed..f79e08ad 100644 --- a/files/infra/config/PRESUBMIT.py +++ 
b/files/infra/config/PRESUBMIT.py @@ -2,6 +2,8 @@ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. +USE_PYTHON3 = True + def CheckChangeOnUpload(input_api, output_api): return input_api.canned_checks.CheckChangedLUCIConfigs(input_api, output_api) diff --git a/files/infra/config/cr-buildbucket.cfg b/files/infra/config/cr-buildbucket.cfg index 061cf33b..be9d1d28 100644 --- a/files/infra/config/cr-buildbucket.cfg +++ b/files/infra/config/cr-buildbucket.cfg @@ -34,6 +34,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -65,6 +69,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -96,6 +104,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -111,7 +123,7 @@ buckets { name: "Android Tester ARM32 Debug (Nexus 5X)" swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" - dimensions: "device_type:bullhead" + dimensions: "device_type:walleye" dimensions: "pool:luci.flex.ci" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -124,6 +136,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' 
"builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -139,7 +155,7 @@ buckets { name: "Android Tester ARM32 Release (Nexus 5X)" swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" - dimensions: "device_type:bullhead" + dimensions: "device_type:walleye" dimensions: "pool:luci.flex.ci" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -152,6 +168,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -167,7 +187,7 @@ buckets { name: "Android Tester ARM64 Debug (Nexus 5X)" swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" - dimensions: "device_type:bullhead" + dimensions: "device_type:walleye" dimensions: "pool:luci.flex.ci" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -180,6 +200,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -211,6 +235,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -242,6 +270,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -273,6 +305,10 @@ 
buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -304,6 +340,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -335,6 +375,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -366,6 +410,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -397,6 +445,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -428,6 +480,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -459,6 +515,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": 
"client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -490,6 +550,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -521,6 +585,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -537,7 +605,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.ci" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -550,6 +618,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -566,7 +638,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.ci" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -579,6 +651,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -595,7 +671,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: 
"vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.ci" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -608,6 +684,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -639,6 +719,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -670,6 +754,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -701,6 +789,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -732,6 +824,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -763,6 +859,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": 
"libyuv/libyuv"' '}' @@ -794,6 +894,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -825,6 +929,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -856,6 +964,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -872,7 +984,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.ci" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -885,6 +997,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -901,7 +1017,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.ci" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -914,6 +1030,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": 
"rbe-webrtc-trusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "client.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -985,7 +1105,7 @@ buckets { name: "android" swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" - dimensions: "device_type:bullhead" + dimensions: "device_type:walleye" dimensions: "pool:luci.flex.try" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -998,6 +1118,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1013,7 +1137,7 @@ buckets { name: "android_arm64" swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" - dimensions: "device_type:bullhead" + dimensions: "device_type:walleye" dimensions: "pool:luci.flex.try" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -1026,6 +1150,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1041,7 +1169,7 @@ buckets { name: "android_rel" swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" - dimensions: "device_type:bullhead" + dimensions: "device_type:walleye" dimensions: "pool:luci.flex.try" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -1054,6 +1182,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' 
},' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1085,6 +1217,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1116,6 +1252,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1132,7 +1272,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.try" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -1145,6 +1285,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1161,7 +1305,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.try" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -1174,6 +1318,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1205,6 +1353,10 @@ buckets { ' "server_host": 
"goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1236,6 +1388,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1267,6 +1423,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1298,6 +1458,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1329,6 +1493,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1360,6 +1528,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1391,6 +1563,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' 
"builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1422,6 +1598,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1438,7 +1618,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.try" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -1451,6 +1631,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1467,7 +1651,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.try" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -1480,6 +1664,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1496,7 +1684,7 @@ buckets { swarming_host: "chromium-swarm.appspot.com" swarming_tags: "vpython:native-python-wrapper" dimensions: "cpu:x86-64" - dimensions: "os:Mac-10.15" + dimensions: "os:Mac-12" dimensions: "pool:luci.flex.try" exe { cipd_package: "infra/recipe_bundles/chromium.googlesource.com/chromium/tools/build" @@ -1509,6 +1697,10 @@ buckets { ' 
"server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1540,6 +1732,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "run_presubmit",' ' "repo_name": "libyuv",' @@ -1573,6 +1769,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1604,6 +1804,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1635,6 +1839,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1666,6 +1874,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1697,6 +1909,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": 
"chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' @@ -1728,6 +1944,10 @@ buckets { ' "server_host": "goma.chromium.org",' ' "use_luci_auth": true' ' },' + ' "$build/reclient": {' + ' "instance": "rbe-webrtc-untrusted",' + ' "metrics_project": "chromium-reclient-metrics"' + ' },' ' "builder_group": "tryserver.libyuv",' ' "recipe": "libyuv/libyuv"' '}' diff --git a/files/infra/config/main.star b/files/infra/config/main.star index b922ca02..7490a599 100755 --- a/files/infra/config/main.star +++ b/files/infra/config/main.star @@ -26,6 +26,16 @@ GOMA_BACKEND_RBE_NO_ATS_PROD = { "enable_ats": False, } +RECLIENT_CI = { + "instance": "rbe-webrtc-trusted", + "metrics_project": "chromium-reclient-metrics", +} + +RECLIENT_CQ = { + "instance": "rbe-webrtc-untrusted", + "metrics_project": "chromium-reclient-metrics", +} + # Use LUCI Scheduler BBv2 names and add Scheduler realms configs. lucicfg.enable_experiment("crbug.com/1182002") @@ -69,6 +79,10 @@ luci.project( acl.entry(acl.BUILDBUCKET_OWNER, groups = ["project-libyuv-admins"]), ], bindings = [ + luci.binding( + roles = "role/swarming.taskTriggerer", # for LED tasks. 
+ groups = "project-libyuv-admins", + ), luci.binding( roles = "role/configs.validator", users = "libyuv-try-builder@chops-service-accounts.iam.gserviceaccount.com", @@ -195,9 +209,9 @@ luci.bucket( def get_os_dimensions(os): if os == "android": - return {"device_type": "bullhead"} + return {"device_type": "walleye"} if os == "ios" or os == "mac": - return {"os": "Mac-10.15", "cpu": "x86-64"} + return {"os": "Mac-12", "cpu": "x86-64"} elif os == "win": return {"os": "Windows-10", "cores": "8", "cpu": "x86-64"} elif os == "linux": @@ -255,6 +269,7 @@ def libyuv_try_builder(name, dimensions, properties, recipe_name = "libyuv/libyu def ci_builder(name, os, category, short_name = None): dimensions = get_os_dimensions(os) properties = get_os_properties(os) + properties["$build/reclient"] = RECLIENT_CI dimensions["pool"] = "luci.flex.ci" properties["builder_group"] = "client.libyuv" @@ -266,6 +281,7 @@ def ci_builder(name, os, category, short_name = None): def try_builder(name, os, experiment_percentage = None): dimensions = get_os_dimensions(os) properties = get_os_properties(os, try_builder = True) + properties["$build/reclient"] = RECLIENT_CQ dimensions["pool"] = "luci.flex.try" properties["builder_group"] = "tryserver.libyuv" diff --git a/files/infra/config/project.cfg b/files/infra/config/project.cfg index 700226ad..af79cfb2 100644 --- a/files/infra/config/project.cfg +++ b/files/infra/config/project.cfg @@ -7,7 +7,7 @@ name: "libyuv" access: "group:all" lucicfg { - version: "1.30.9" + version: "1.39.8" package_dir: "." config_dir: "." 
entry_point: "main.star" diff --git a/files/infra/config/realms.cfg b/files/infra/config/realms.cfg index ae04529e..16ffaac9 100644 --- a/files/infra/config/realms.cfg +++ b/files/infra/config/realms.cfg @@ -38,6 +38,10 @@ realms { role: "role/scheduler.reader" principals: "group:all" } + bindings { + role: "role/swarming.taskTriggerer" + principals: "group:project-libyuv-admins" + } } realms { name: "ci" diff --git a/files/libyuv.gni b/files/libyuv.gni index 8df40ba2..0a6c4453 100644 --- a/files/libyuv.gni +++ b/files/libyuv.gni @@ -6,13 +6,14 @@ # in the file PATENTS. All contributing project authors may # be found in the AUTHORS file in the root of the source tree. -import("//build_overrides/build.gni") import("//build/config/arm.gni") import("//build/config/mips.gni") +import("//build_overrides/build.gni") declare_args() { libyuv_include_tests = !build_with_chromium libyuv_disable_jpeg = false + libyuv_disable_rvv = false libyuv_use_neon = current_cpu == "arm64" || (current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon)) diff --git a/files/riscv_script/prepare_toolchain_qemu.sh b/files/riscv_script/prepare_toolchain_qemu.sh new file mode 100755 index 00000000..2a901739 --- /dev/null +++ b/files/riscv_script/prepare_toolchain_qemu.sh @@ -0,0 +1,74 @@ +#!/bin/bash +set -ev + +# Download & build RISC-V Clang toolchain & QEMU emulator. +# RISC-V Clang is for cross compile with the RISC-V Vector ISA. +# RISC-V QEMU is used to run the test suite. +# +# Requirements: Linux host w/ working C++ compiler, git, cmake, ninja, wget, tar + +# NOTE: this script must be run from the top-level directory of the LIBYUV_SRC_DIR. 
+ +RISCV_TRIPLE="riscv64-unknown-linux-gnu" +RISCV_QEMU="qemu-riscv64" + +LIBYUV_SRC_DIR=$(pwd) +BUILD_DIR="$LIBYUV_SRC_DIR"/build-toolchain-qemu +INSTALL_QEMU="$BUILD_DIR"/riscv-qemu +INSTALL_CLANG="$BUILD_DIR"/riscv-clang + +LLVM_VERSION="16.0.0" +LLVM_NAME=llvm-project-"$LLVM_VERSION".src + +RISCV_GNU_TOOLCHAIN="$BUILD_DIR"/riscv-gnu-toolchain +RISCV_CLANG_TOOLCHAIN="$BUILD_DIR"/"$LLVM_NAME" + +QEMU_NAME="qemu-7.0.0" + +mkdir -p "$BUILD_DIR" +cd "$BUILD_DIR" + +# Download and install RISC-V GNU Toolchain (needed to build Clang) +if [ ! -d "$RISCV_GNU_TOOLCHAIN" ] +then + git clone git@github.com:riscv/riscv-gnu-toolchain.git + pushd "$RISCV_GNU_TOOLCHAIN" + git submodule update --init --recursive + ./configure --with-cmodel=medany --prefix="$INSTALL_CLANG" + ionice nice make linux -j `nproc` install + popd +fi + +# Download Clang toolchain & build cross compiler +if [ ! -d "$RISCV_CLANG_TOOLCHAIN" ] +then + wget https://github.com/llvm/llvm-project/releases/download/llvmorg-"$LLVM_VERSION"/"$LLVM_NAME".tar.xz + tar xvJf "$LLVM_NAME".tar.xz + pushd "$RISCV_CLANG_TOOLCHAIN" + cmake -DCMAKE_INSTALL_PREFIX="$INSTALL_CLANG" \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_TARGETS_TO_BUILD="RISCV" \ + -DLLVM_ENABLE_PROJECTS="clang" \ + -DLLVM_DEFAULT_TARGET_TRIPLE="$RISCV_TRIPLE" \ + -DLLVM_INSTALL_TOOLCHAIN_ONLY=On \ + -DDEFAULT_SYSROOT=../sysroot \ + -G "Ninja" "$RISCV_CLANG_TOOLCHAIN"/llvm + ionice nice ninja -j `nproc` + ionice nice ninja -j `nproc` install + popd + pushd "$INSTALL_CLANG"/bin + ln -sf clang "$RISCV_TRIPLE"-clang + ln -sf clang++ "$RISCV_TRIPLE"-clang++ + popd +fi + +# Download QEMU and build the riscv64 Linux usermode emulator +if [ ! 
-d "$QEMU_NAME" ] +then + wget https://download.qemu.org/"$QEMU_NAME".tar.xz + tar xvJf "$QEMU_NAME".tar.xz + pushd "$QEMU_NAME" + ./configure --target-list=riscv64-linux-user --prefix="$INSTALL_QEMU" + ionice nice make -j `nproc` install + popd +fi diff --git a/files/riscv_script/riscv-clang.cmake b/files/riscv_script/riscv-clang.cmake new file mode 100644 index 00000000..47dd5067 --- /dev/null +++ b/files/riscv_script/riscv-clang.cmake @@ -0,0 +1,52 @@ +set(CMAKE_CROSSCOMPILING TRUE) +set(CMAKE_SYSTEM_NAME "Linux") +set(CMAKE_SYSTEM_PROCESSOR "riscv64") + +option(USE_RVV "Enable riscv vector or not." ON) +option(USE_AUTO_VECTORIZER "Enable riscv auto vectorizer or not." OFF) + +# Avoid to use system path for cross-compile +set(CMAKE_FIND_USE_CMAKE_SYSTEM_PATH FALSE) + +set(TOOLCHAIN_PATH "" CACHE STRING "The toolcahin path.") +if(NOT TOOLCHAIN_PATH) + set(TOOLCHAIN_PATH ${CMAKE_SOURCE_DIR}/build-toolchain-qemu/riscv-clang) +endif() + +set(TOOLCHAIN_PREFIX "riscv64-unknown-linux-gnu-" CACHE STRING "The toolcahin prefix.") + +# toolchain setting +set(CMAKE_C_COMPILER "${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}clang") +set(CMAKE_CXX_COMPILER "${TOOLCHAIN_PATH}/bin/${TOOLCHAIN_PREFIX}clang++") + +# CMake will just use the host-side tools for the following tools, so we setup them here. 
+set(CMAKE_C_COMPILER_AR "${TOOLCHAIN_PATH}/bin/llvm-ar") +set(CMAKE_CXX_COMPILER_AR "${TOOLCHAIN_PATH}/bin/llvm-ar") +set(CMAKE_C_COMPILER_RANLIB "${TOOLCHAIN_PATH}/bin/llvm-ranlib") +set(CMAKE_CXX_COMPILER_RANLIB "${TOOLCHAIN_PATH}/bin/llvm-ranlib") +set(CMAKE_OBJDUMP "${TOOLCHAIN_PATH}/bin/llvm-objdump") +set(CMAKE_OBJCOPY "${TOOLCHAIN_PATH}/bin/llvm-objcopy") + +# compile options +message(STATUS "USE_RVV: ${USE_RVV}") +message(STATUS "USE_AUTO_VECTORIZER: ${USE_AUTO_VECTORIZER}") +set(RISCV_COMPILER_FLAGS) +if(USE_RVV) + list(APPEND RISCV_COMPILER_FLAGS "-march=rv64gcv") + if(NOT USE_AUTO_VECTORIZER) + # Disable auto-vectorizer + add_compile_options(-fno-vectorize -fno-slp-vectorize) + endif() +else() + list(APPEND RISCV_COMPILER_FLAGS "-march=rv64gc") +endif() +message(STATUS "RISCV_COMPILER_FLAGS: ${RISCV_COMPILER_FLAGS}") + +set(CMAKE_C_FLAGS "${RISCV_COMPILER_FLAGS} ${CMAKE_C_FLAGS}") +set(CMAKE_CXX_FLAGS "${RISCV_COMPILER_FLAGS} ${CMAKE_CXX_FLAGS}") + +set(RISCV_LINKER_FLAGS "-lstdc++ -lpthread -lm -ldl") +set(RISCV_LINKER_FLAGS_EXE) +set(CMAKE_SHARED_LINKER_FLAGS "${RISCV_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}") +set(CMAKE_MODULE_LINKER_FLAGS "${RISCV_LINKER_FLAGS} ${CMAKE_MODULE_LINKER_FLAGS}") +set(CMAKE_EXE_LINKER_FLAGS "${RISCV_LINKER_FLAGS} ${RISCV_LINKER_FLAGS_EXE} ${CMAKE_EXE_LINKER_FLAGS}") diff --git a/files/riscv_script/run_qemu.sh b/files/riscv_script/run_qemu.sh new file mode 100755 index 00000000..080af3b1 --- /dev/null +++ b/files/riscv_script/run_qemu.sh @@ -0,0 +1,15 @@ +#!/bin/bash +set -x +set -e + +USE_RVV="${USE_RVV:-OFF}" +TOOLCHAIN_PATH="${TOOLCHAIN_PATH:-../../build-toolchain-qemu/riscv-clang}" +QEMU_PREFIX_PATH="${QEMU_PREFIX_PATH:-../../build-toolchain-qemu/riscv-qemu/}" + +if [ "${USE_RVV}" = "ON" ];then + QEMU_OPTION="-cpu rv64,zba=true,zbb=true,zbc=true,zbs=true,v=true,vlen=512,elen=64,vext_spec=v1.0 -L ${TOOLCHAIN_PATH}/sysroot" +else + QEMU_OPTION="-cpu rv64,zba=true,zbb=true,zbc=true,zbs=true -L 
${TOOLCHAIN_PATH}/sysroot" +fi + +$QEMU_PREFIX_PATH/bin/qemu-riscv64 $QEMU_OPTION $@ diff --git a/files/source/compare.cc b/files/source/compare.cc index d4713b60..50a736bd 100644 --- a/files/source/compare.cc +++ b/files/source/compare.cc @@ -45,7 +45,7 @@ uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) { } #endif - while (count >= (uint64_t)(kBlockSize)) { + while (count >= (uint64_t)kBlockSize) { seed = HashDjb2_SSE(src, kBlockSize, seed); src += kBlockSize; count -= kBlockSize; @@ -359,10 +359,10 @@ static double Ssim8x8_C(const uint8_t* src_a, (sum_a_sq + sum_b_sq + c1) * (count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2); - if (ssim_d == 0.0) { + if (ssim_d == 0) { return DBL_MAX; } - return ssim_n * 1.0 / ssim_d; + return (double)ssim_n / (double)ssim_d; } } diff --git a/files/source/compare_gcc.cc b/files/source/compare_gcc.cc index b834b42a..33cbe25d 100644 --- a/files/source/compare_gcc.cc +++ b/files/source/compare_gcc.cc @@ -67,7 +67,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, : : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10"); - return static_cast(diff); + return (uint32_t)(diff); } #else uint32_t HammingDistance_SSE42(const uint8_t* src_a, diff --git a/files/source/compare_mmi.cc b/files/source/compare_mmi.cc deleted file mode 100644 index 7640d946..00000000 --- a/files/source/compare_mmi.cc +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "libyuv/basic_types.h" - -#include "libyuv/compare_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for Mips MMI. -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -// Hakmem method for hamming distance. -uint32_t HammingDistance_MMI(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff = 0u; - - uint64_t temp = 0, temp1 = 0, ta = 0, tb = 0; - uint64_t c1 = 0x5555555555555555; - uint64_t c2 = 0x3333333333333333; - uint64_t c3 = 0x0f0f0f0f0f0f0f0f; - uint32_t c4 = 0x01010101; - uint64_t s1 = 1, s2 = 2, s3 = 4; - __asm__ volatile( - "1: \n\t" - "ldc1 %[ta], 0(%[src_a]) \n\t" - "ldc1 %[tb], 0(%[src_b]) \n\t" - "xor %[temp], %[ta], %[tb] \n\t" - "psrlw %[temp1], %[temp], %[s1] \n\t" // temp1=x>>1 - "and %[temp1], %[temp1], %[c1] \n\t" // temp1&=c1 - "psubw %[temp1], %[temp], %[temp1] \n\t" // x-temp1 - "and %[temp], %[temp1], %[c2] \n\t" // t = (u&c2) - "psrlw %[temp1], %[temp1], %[s2] \n\t" // u>>2 - "and %[temp1], %[temp1], %[c2] \n\t" // u>>2 & c2 - "paddw %[temp1], %[temp1], %[temp] \n\t" // t1 = t1+t - "psrlw %[temp], %[temp1], %[s3] \n\t" // u>>4 - "paddw %[temp1], %[temp1], %[temp] \n\t" // u+(u>>4) - "and %[temp1], %[temp1], %[c3] \n\t" //&c3 - "dmfc1 $t0, %[temp1] \n\t" - "dsrl32 $t0, $t0, 0 \n\t " - "mul $t0, $t0, %[c4] \n\t" - "dsrl $t0, $t0, 24 \n\t" - "dadd %[diff], %[diff], $t0 \n\t" - "dmfc1 $t0, %[temp1] \n\t" - "mul $t0, $t0, %[c4] \n\t" - "dsrl $t0, $t0, 24 \n\t" - "dadd %[diff], %[diff], $t0 \n\t" - "daddiu %[src_a], %[src_a], 8 \n\t" - "daddiu %[src_b], %[src_b], 8 \n\t" - "addiu %[count], %[count], -8 \n\t" - "bgtz %[count], 1b \n\t" - "nop \n\t" - : [diff] "+r"(diff), [src_a] "+r"(src_a), [src_b] "+r"(src_b), - [count] "+r"(count), [ta] "+f"(ta), [tb] "+f"(tb), [temp] "+f"(temp), - [temp1] "+f"(temp1) - : [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [c4] "r"(c4), [s1] "f"(s1), - [s2] "f"(s2), [s3] "f"(s3) - : "memory"); - return diff; -} - -uint32_t 
SumSquareError_MMI(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t sse = 0u; - uint32_t sse_hi = 0u, sse_lo = 0u; - - uint64_t src1, src2; - uint64_t diff, diff_hi, diff_lo; - uint64_t sse_sum, sse_tmp; - - const uint64_t mask = 0x0ULL; - - __asm__ volatile( - "xor %[sse_sum], %[sse_sum], %[sse_sum] \n\t" - - "1: \n\t" - "ldc1 %[src1], 0x00(%[src_a]) \n\t" - "ldc1 %[src2], 0x00(%[src_b]) \n\t" - "pasubub %[diff], %[src1], %[src2] \n\t" - "punpcklbh %[diff_lo], %[diff], %[mask] \n\t" - "punpckhbh %[diff_hi], %[diff], %[mask] \n\t" - "pmaddhw %[sse_tmp], %[diff_lo], %[diff_lo] \n\t" - "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t" - "pmaddhw %[sse_tmp], %[diff_hi], %[diff_hi] \n\t" - "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t" - - "daddiu %[src_a], %[src_a], 0x08 \n\t" - "daddiu %[src_b], %[src_b], 0x08 \n\t" - "daddiu %[count], %[count], -0x08 \n\t" - "bnez %[count], 1b \n\t" - - "mfc1 %[sse_lo], %[sse_sum] \n\t" - "mfhc1 %[sse_hi], %[sse_sum] \n\t" - "daddu %[sse], %[sse_hi], %[sse_lo] \n\t" - : [sse] "+&r"(sse), [diff] "=&f"(diff), [src1] "=&f"(src1), - [src2] "=&f"(src2), [diff_lo] "=&f"(diff_lo), [diff_hi] "=&f"(diff_hi), - [sse_sum] "=&f"(sse_sum), [sse_tmp] "=&f"(sse_tmp), - [sse_hi] "+&r"(sse_hi), [sse_lo] "+&r"(sse_lo) - : [src_a] "r"(src_a), [src_b] "r"(src_b), [count] "r"(count), - [mask] "f"(mask) - : "memory"); - - return sse; -} - -#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/files/source/convert.cc b/files/source/convert.cc index 7178580f..b11ab1bf 100644 --- a/files/source/convert.cc +++ b/files/source/convert.cc @@ -24,6 +24,10 @@ namespace libyuv { extern "C" { #endif +// Subsample amount uses a shift. +// v is value +// a is amount to add to round up +// s is shift to subsample down #define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) static __inline int Abs(int v) { return v >= 0 ? 
v : -v; @@ -199,6 +203,99 @@ static int Planar16bitTo8bit(const uint16_t* src_y, return 0; } +static int I41xToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int depth) { + const int scale = 1 << (24 - depth); + + if (width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + { + const int uv_width = SUBSAMPLE(width, 1, 1); + const int uv_height = SUBSAMPLE(height, 1, 1); + + Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width, + height); + ScalePlaneDown2_16To8(width, height, uv_width, uv_height, src_stride_u, + dst_stride_u, src_u, dst_u, scale, kFilterBilinear); + ScalePlaneDown2_16To8(width, height, uv_width, uv_height, src_stride_v, + dst_stride_v, src_v, dst_v, scale, kFilterBilinear); + } + return 0; +} + +static int I21xToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int depth) { + const int scale = 1 << (24 - depth); + + if (width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + { + const int uv_width = SUBSAMPLE(width, 1, 1); + const int uv_height = SUBSAMPLE(height, 1, 1); + const int dy = FixedDiv(height, uv_height); + + Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width, + height); + ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_u, + dst_stride_u, src_u, dst_u, 0, 32768, dy, + /*bpp=*/1, scale, kFilterBilinear); + ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_v, + dst_stride_v, src_v, dst_v, 0, 32768, dy, + /*bpp=*/1, scale, kFilterBilinear); + } + return 0; +} + // Convert 10 bit YUV to 8 bit. LIBYUV_API int I010ToI420(const uint16_t* src_y, @@ -236,38 +333,9 @@ int I210ToI420(const uint16_t* src_y, int dst_stride_v, int width, int height) { - const int depth = 10; - const int scale = 1 << (24 - depth); - - if (width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (height - 1) * src_stride_u; - src_v = src_v + (height - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - { - const int uv_width = SUBSAMPLE(width, 1, 1); - const int uv_height = SUBSAMPLE(height, 1, 1); - const int dy = FixedDiv(height, uv_height); - - Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width, - height); - ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_u, - dst_stride_u, src_u, dst_u, 0, 32768, dy, - /*bpp=*/1, scale, kFilterBilinear); - ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_v, - dst_stride_v, src_v, dst_v, 0, 32768, dy, - /*bpp=*/1, scale, kFilterBilinear); - } - return 0; + return I21xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 10); } LIBYUV_API @@ -291,6 +359,26 @@ int I210ToI422(const uint16_t* src_y, 0, 10); } +LIBYUV_API +int I410ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return I41xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 10); +} + LIBYUV_API int I410ToI444(const uint16_t* src_y, int src_stride_y, @@ -354,6 +442,26 @@ int I212ToI422(const uint16_t* src_y, 0, 12); } +LIBYUV_API +int I212ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int 
height) { + return I21xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 12); +} + LIBYUV_API int I412ToI444(const uint16_t* src_y, int src_stride_y, @@ -375,6 +483,26 @@ int I412ToI444(const uint16_t* src_y, 0, 12); } +LIBYUV_API +int I412ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return I41xToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, 12); +} + // Any Ix10 To I010 format with mirroring. static int Ix10ToI010(const uint16_t* src_y, int src_stride_y, @@ -713,6 +841,110 @@ int MM21ToI420(const uint8_t* src_y, return 0; } +LIBYUV_API +int MM21ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { + if (!src_y || !src_uv || !dst_yuy2 || width <= 0) { + return -1; + } + + DetileToYUY2(src_y, src_stride_y, src_uv, src_stride_uv, dst_yuy2, + dst_stride_yuy2, width, height, 32); + + return 0; +} + +// Convert MT2T into P010. See tinyurl.com/mtk-10bit-video-format for format +// documentation. +// TODO(greenjustin): Add an MT2T to I420 conversion. 
+LIBYUV_API +int MT2TToP010(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (width <= 0 || !height || !src_uv || !dst_uv) { + return -1; + } + + { + int uv_width = (width + 1) & ~1; + int uv_height = (height + 1) / 2; + int y = 0; + const int tile_width = 16; + const int y_tile_height = 32; + const int uv_tile_height = 16; + int padded_width = (width + tile_width - 1) & ~(tile_width - 1); + int y_tile_row_size = padded_width * y_tile_height * 10 / 8; + int uv_tile_row_size = padded_width * uv_tile_height * 10 / 8; + size_t row_buf_size = padded_width * y_tile_height * sizeof(uint16_t); + void (*UnpackMT2T)(const uint8_t* src, uint16_t* dst, size_t size) = + UnpackMT2T_C; + align_buffer_64(row_buf, row_buf_size); + +#if defined(HAS_UNPACKMT2T_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + UnpackMT2T = UnpackMT2T_NEON; + } +#endif + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + uv_height = (height + 1) / 2; + if (dst_y) { + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + dst_uv = dst_uv + (uv_height - 1) * dst_stride_uv; + dst_stride_uv = -dst_stride_uv; + } + + // Unpack and detile Y in rows of tiles + if (src_y && dst_y) { + for (y = 0; y < (height & ~(y_tile_height - 1)); y += y_tile_height) { + UnpackMT2T(src_y, (uint16_t*)row_buf, y_tile_row_size); + DetilePlane_16((uint16_t*)row_buf, padded_width, dst_y, dst_stride_y, + width, y_tile_height, y_tile_height); + src_y += src_stride_y * y_tile_height; + dst_y += dst_stride_y * y_tile_height; + } + if (height & (y_tile_height - 1)) { + UnpackMT2T(src_y, (uint16_t*)row_buf, y_tile_row_size); + DetilePlane_16((uint16_t*)row_buf, padded_width, dst_y, dst_stride_y, + width, height & (y_tile_height - 1), y_tile_height); + } + } + + // Unpack and detile UV plane + for (y = 0; y < (uv_height & ~(uv_tile_height - 1)); y += uv_tile_height) { + UnpackMT2T(src_uv, (uint16_t*)row_buf, uv_tile_row_size); + DetilePlane_16((uint16_t*)row_buf, padded_width, dst_uv, dst_stride_uv, + uv_width, uv_tile_height, uv_tile_height); + src_uv += src_stride_uv * uv_tile_height; + dst_uv += dst_stride_uv * uv_tile_height; + } + if (uv_height & (uv_tile_height - 1)) { + UnpackMT2T(src_uv, (uint16_t*)row_buf, uv_tile_row_size); + DetilePlane_16((uint16_t*)row_buf, padded_width, dst_uv, dst_stride_uv, + uv_width, uv_height & (uv_tile_height - 1), + uv_tile_height); + } + free_aligned_buffer_64(row_buf); + } + return 0; +} + #ifdef I422TONV21_ROW_VERSION // Unittest fails for this version. 
// 422 chroma is 1/2 width, 1x height @@ -734,7 +966,7 @@ int I422ToNV21(const uint8_t* src_y, int y; void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) = MergeUVRow_C; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; int halfwidth = (width + 1) >> 1; @@ -764,11 +996,19 @@ int I422ToNV21(const uint8_t* src_y, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow = MergeUVRow_AVX2; } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow = MergeUVRow_Any_NEON; @@ -793,6 +1033,11 @@ int I422ToNV21(const uint8_t* src_y, } } #endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow = MergeUVRow_RVV; + } +#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -833,6 +1078,11 @@ int I422ToNV21(const uint8_t* src_y, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif if (dst_y) { CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, halfwidth, height); @@ -1118,6 +1368,70 @@ int NV16ToNV24(const uint8_t* src_y, return 0; } +// Any P[420]1[02] to I[420]1[02] format with mirroring. 
+static int PxxxToIxxx(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + int subsample_x, + int subsample_y, + int depth) { + const int uv_width = SUBSAMPLE(width, subsample_x, subsample_x); + const int uv_height = SUBSAMPLE(height, subsample_y, subsample_y); + if (width <= 0 || height == 0) { + return -1; + } + ConvertToLSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height, + depth); + SplitUVPlane_16(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, uv_width, uv_height, depth); + return 0; +} + +LIBYUV_API +int P010ToI010(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + return PxxxToIxxx(src_y, src_stride_y, src_uv, src_stride_uv, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, + width, height, 1, 1, 10); +} + +LIBYUV_API +int P012ToI012(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + return PxxxToIxxx(src_y, src_stride_y, src_uv, src_stride_uv, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, + width, height, 1, 1, 12); +} + LIBYUV_API int P010ToP410(const uint16_t* src_y, int src_stride_y, @@ -1231,6 +1545,16 @@ int YUY2ToI420(const uint8_t* src_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_LSX) && defined(HAS_YUY2TOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + YUY2ToYRow = YUY2ToYRow_Any_LSX; + YUY2ToUVRow = YUY2ToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_LSX; + YUY2ToUVRow = YUY2ToUVRow_LSX; + } + } +#endif #if 
defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { YUY2ToYRow = YUY2ToYRow_Any_LASX; @@ -1322,6 +1646,26 @@ int UYVYToI420(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + UYVYToYRow = UYVYToYRow_Any_LSX; + UYVYToUVRow = UYVYToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_LSX; + UYVYToUVRow = UYVYToUVRow_LSX; + } + } +#endif +#if defined(HAS_UYVYTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + UYVYToYRow = UYVYToYRow_Any_LSX; + UYVYToUVRow = UYVYToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_LSX; + UYVYToUVRow = UYVYToUVRow_LSX; + } + } +#endif #if defined(HAS_UYVYTOYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { UYVYToYRow = UYVYToYRow_Any_LASX; @@ -1574,6 +1918,176 @@ int ARGBToI420(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + ARGBToUVRow = ARGBToUVRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + ARGBToUVRow = ARGBToUVRow_LASX; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVRow(src_argb, 
0, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + } + return 0; +} + +#ifdef USE_EXTRACTALPHA +// Convert ARGB to I420 with Alpha +// The following version calls ARGBExtractAlpha on the full image. +LIBYUV_API +int ARGBToI420Alpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height) { + int r = ARGBToI420(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height); + if (r == 0) { + r = ARGBExtractAlpha(src_argb, src_stride_argb, dst_a, dst_stride_a, width, + height); + } + return r; +} +#else // USE_EXTRACTALPHA +// Convert ARGB to I420 with Alpha +LIBYUV_API +int ARGBToI420Alpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height) { + int y; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYRow_C; + void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a, + int width) = ARGBExtractAlphaRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || !dst_a || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -1584,22 +2098,58 @@ int ARGBToI420(const uint8_t* src_argb, } } #endif +#if 
defined(HAS_ARGBEXTRACTALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2 + : ARGBExtractAlphaRow_Any_SSE2; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 32) ? ARGBExtractAlphaRow_AVX2 + : ARGBExtractAlphaRow_Any_AVX2; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON + : ARGBExtractAlphaRow_Any_NEON; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA + : ARGBExtractAlphaRow_Any_MSA; + } +#endif +#if defined(HAS_ARGBEXTRACTALPHAROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_LSX + : ARGBExtractAlphaRow_Any_LSX; + } +#endif for (y = 0; y < height - 1; y += 2) { ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); ARGBToYRow(src_argb, dst_y, width); ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + ARGBExtractAlphaRow(src_argb, dst_a, width); + ARGBExtractAlphaRow(src_argb + src_stride_argb, dst_a + dst_stride_a, + width); src_argb += src_stride_argb * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; + dst_a += dst_stride_a * 2; } if (height & 1) { ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); ARGBToYRow(src_argb, dst_y, width); + ARGBExtractAlphaRow(src_argb, dst_a, width); } return 0; } +#endif // USE_EXTRACTALPHA // Convert BGRA to I420. 
LIBYUV_API @@ -1628,16 +2178,6 @@ int BGRAToI420(const uint8_t* src_bgra, src_bgra = src_bgra + (height - 1) * src_stride_bgra; src_stride_bgra = -src_stride_bgra; } -#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - BGRAToUVRow = BGRAToUVRow_Any_SSSE3; - BGRAToYRow = BGRAToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - BGRAToUVRow = BGRAToUVRow_SSSE3; - BGRAToYRow = BGRAToYRow_SSSE3; - } - } -#endif #if defined(HAS_BGRATOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { BGRAToYRow = BGRAToYRow_Any_NEON; @@ -1654,12 +2194,46 @@ int BGRAToI420(const uint8_t* src_bgra, } } #endif +#if defined(HAS_BGRATOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + BGRAToYRow = BGRAToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + BGRAToYRow = BGRAToYRow_SSSE3; + } + } +#endif +#if defined(HAS_BGRATOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + BGRAToUVRow = BGRAToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_BGRATOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + BGRAToYRow = BGRAToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + BGRAToYRow = BGRAToYRow_AVX2; + } + } +#endif +#if defined(HAS_BGRATOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + BGRAToUVRow = BGRAToUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + BGRAToUVRow = BGRAToUVRow_AVX2; + } + } +#endif #if defined(HAS_BGRATOYROW_MSA) && defined(HAS_BGRATOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { BGRAToYRow = BGRAToYRow_Any_MSA; BGRAToUVRow = BGRAToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { BGRAToYRow = BGRAToYRow_MSA; + } + if (IS_ALIGNED(width, 32)) { BGRAToUVRow = BGRAToUVRow_MSA; } } @@ -1674,6 +2248,19 @@ int BGRAToI420(const uint8_t* src_bgra, } } #endif +#if defined(HAS_BGRATOYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + BGRAToYRow = BGRAToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + BGRAToYRow = BGRAToYRow_LASX; + } + } +#endif +#if defined(HAS_BGRATOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { 
+ BGRAToYRow = BGRAToYRow_RVV; + } +#endif for (y = 0; y < height - 1; y += 2) { BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width); @@ -1786,6 +2373,19 @@ int ABGRToI420(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ABGRToYRow = ABGRToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ABGRToYRow = ABGRToYRow_LASX; + } + } +#endif +#if defined(HAS_ABGRTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYRow = ABGRToYRow_RVV; + } +#endif for (y = 0; y < height - 1; y += 2) { ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); @@ -1882,6 +2482,19 @@ int RGBAToI420(const uint8_t* src_rgba, } } #endif +#if defined(HAS_RGBATOYROW_LASX) + if (TestCpuFlag(kCpuHasNEON)) { + RGBAToYRow = RGBAToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGBAToYRow = RGBAToYRow_LASX; + } + } +#endif +#if defined(HAS_RGBATOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGBAToYRow = RGBAToYRow_RVV; + } +#endif for (y = 0; y < height - 1; y += 2) { RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width); @@ -1901,7 +2514,7 @@ int RGBAToI420(const uint8_t* src_rgba, // Enabled if 1 pass is available #if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ - defined(HAS_RGB24TOYROW_LSX)) + defined(HAS_RGB24TOYROW_LSX) || defined(HAS_RGB24TOYROW_RVV)) #define HAS_RGB24TOYROW #endif @@ -1986,6 +2599,11 @@ int RGB24ToI420(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGB24TOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGB24ToYRow = RGB24ToYRow_RVV; + } +#endif // Other platforms do intermediate conversion from RGB24 to ARGB. #else // HAS_RGB24TOYROW @@ -2035,8 +2653,8 @@ int RGB24ToI420(const uint8_t* src_rgb24, { #if !defined(HAS_RGB24TOYROW) // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); #endif for (y = 0; y < height - 1; y += 2) { @@ -2046,10 +2664,10 @@ int RGB24ToI420(const uint8_t* src_rgb24, RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); #else RGB24ToARGBRow(src_rgb24, row, width); - RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + row_size, width); + ARGBToUVRow(row, row_size, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); #endif src_rgb24 += src_stride_rgb24 * 2; dst_y += dst_stride_y * 2; @@ -2075,7 +2693,8 @@ int RGB24ToI420(const uint8_t* src_rgb24, #undef HAS_RGB24TOYROW // Enabled if 1 pass is available -#if defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) +#if defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_RVV) #define HAS_RGB24TOYJROW #endif @@ -2140,6 +2759,27 @@ int RGB24ToJ420(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGB24TOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RGB24ToYJRow = RGB24ToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RGB24ToYJRow = RGB24ToYJRow_LSX; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGB24ToYJRow = RGB24ToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGB24ToYJRow = RGB24ToYJRow_LASX; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGB24ToYJRow = RGB24ToYJRow_RVV; + } +#endif // Other platforms do intermediate conversion from RGB24 to ARGB. #else // HAS_RGB24TOYJROW @@ -2189,8 +2829,8 @@ int RGB24ToJ420(const uint8_t* src_rgb24, { #if !defined(HAS_RGB24TOYJROW) // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); #endif for (y = 0; y < height - 1; y += 2) { @@ -2200,10 +2840,10 @@ int RGB24ToJ420(const uint8_t* src_rgb24, RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); #else RGB24ToARGBRow(src_rgb24, row, width); - RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); - ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width); + RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + row_size, width); + ARGBToUVJRow(row, row_size, dst_u, dst_v, width); ARGBToYJRow(row, dst_y, width); - ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGBToYJRow(row + row_size, dst_y + dst_stride_y, width); #endif src_rgb24 += src_stride_rgb24 * 2; dst_y += dst_stride_y * 2; @@ -2230,7 +2870,7 @@ int RGB24ToJ420(const uint8_t* src_rgb24, // Enabled if 1 pass is available #if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ - defined(HAS_RAWTOYROW_LSX)) + defined(HAS_RAWTOYROW_LSX) || defined(HAS_RAWTOYROW_RVV)) #define HAS_RAWTOYROW #endif @@ -2314,6 +2954,11 @@ int RAWToI420(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToYRow = RAWToYRow_RVV; + } +#endif // Other platforms do intermediate conversion from RAW to ARGB. #else // HAS_RAWTOYROW @@ -2363,8 +3008,8 @@ int RAWToI420(const uint8_t* src_raw, { #if !defined(HAS_RAWTOYROW) // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); #endif for (y = 0; y < height - 1; y += 2) { @@ -2374,10 +3019,10 @@ int RAWToI420(const uint8_t* src_raw, RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); #else RAWToARGBRow(src_raw, row, width); - RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + RAWToARGBRow(src_raw + src_stride_raw, row + row_size, width); + ARGBToUVRow(row, row_size, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); #endif src_raw += src_stride_raw * 2; dst_y += dst_stride_y * 2; @@ -2403,7 +3048,8 @@ int RAWToI420(const uint8_t* src_raw, #undef HAS_RAWTOYROW // Enabled if 1 pass is available -#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) +#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) || \ + defined(HAS_RAWTOYJROW_RVV) #define HAS_RAWTOYJROW #endif @@ -2468,6 +3114,27 @@ int RAWToJ420(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RAWToYJRow = RAWToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_LSX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RAWToYJRow = RAWToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RAWToYJRow = RAWToYJRow_LASX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToYJRow = RAWToYJRow_RVV; + } +#endif // Other platforms do intermediate conversion from RAW to ARGB. #else // HAS_RAWTOYJROW @@ -2517,8 +3184,8 @@ int RAWToJ420(const uint8_t* src_raw, { #if !defined(HAS_RAWTOYJROW) // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); #endif for (y = 0; y < height - 1; y += 2) { @@ -2528,10 +3195,10 @@ int RAWToJ420(const uint8_t* src_raw, RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); #else RAWToARGBRow(src_raw, row, width); - RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); - ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width); + RAWToARGBRow(src_raw + src_stride_raw, row + row_size, width); + ARGBToUVJRow(row, row_size, dst_u, dst_v, width); ARGBToYJRow(row, dst_y, width); - ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGBToYJRow(row + row_size, dst_y + dst_stride_y, width); #endif src_raw += src_stride_raw * 2; dst_y += dst_stride_y * 2; @@ -2695,8 +3362,8 @@ int RGB565ToI420(const uint8_t* src_rgb565, #if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ defined(HAS_RGB565TOYROW_LSX) || defined(HAS_RGB565TOYROW_LASX)) // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); #endif for (y = 0; y < height - 1; y += 2) { #if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ @@ -2706,10 +3373,10 @@ int RGB565ToI420(const uint8_t* src_rgb565, RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); #else RGB565ToARGBRow(src_rgb565, row, width); - RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + row_size, width); + ARGBToUVRow(row, row_size, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); #endif src_rgb565 += src_stride_rgb565 * 2; dst_y += dst_stride_y * 2; @@ -2875,8 +3542,8 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, #if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ defined(HAS_ARGB1555TOYROW_LSX) || defined(HAS_ARGB1555TOYROW_LASX)) // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); #endif for (y = 0; y < height - 1; y += 2) { @@ -2888,11 +3555,11 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, width); #else ARGB1555ToARGBRow(src_argb1555, row, width); - ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize, + ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + row_size, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToUVRow(row, row_size, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); #endif src_argb1555 += src_stride_argb1555 * 2; dst_y += dst_stride_y * 2; @@ -3055,6 +3722,24 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -3070,8 +3755,8 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, { #if !(defined(HAS_ARGB4444TOYROW_NEON)) // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); #endif for (y = 0; y < height - 1; y += 2) { @@ -3082,11 +3767,11 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, width); #else ARGB4444ToARGBRow(src_argb4444, row, width); - ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize, + ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + row_size, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToUVRow(row, row_size, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGBToYRow(row + row_size, dst_y + dst_stride_y, width); #endif src_argb4444 += src_stride_argb4444 * 2; dst_y += dst_stride_y * 2; @@ -3167,6 +3852,27 @@ int RGB24ToJ400(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGB24TOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RGB24ToYJRow = RGB24ToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RGB24ToYJRow = RGB24ToYJRow_LSX; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGB24ToYJRow = RGB24ToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGB24ToYJRow = RGB24ToYJRow_LASX; + } + } +#endif +#if defined(HAS_RGB24TOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGB24ToYJRow = RGB24ToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { RGB24ToYJRow(src_rgb24, dst_yj, width); @@ -3235,6 +3941,27 @@ int RAWToJ400(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RAWToYJRow = RAWToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_LSX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RAWToYJRow = RAWToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RAWToYJRow = RAWToYJRow_LASX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToYJRow = RAWToYJRow_RVV; 
+ } +#endif for (y = 0; y < height; ++y) { RAWToYJRow(src_raw, dst_yj, width); diff --git a/files/source/convert_argb.cc b/files/source/convert_argb.cc index 71ef8c10..cc6560de 100644 --- a/files/source/convert_argb.cc +++ b/files/source/convert_argb.cc @@ -7,8 +7,12 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ + #include "libyuv/convert_argb.h" +#include + +#include "libyuv/convert_from_argb.h" #include "libyuv/cpu_id.h" #ifdef HAVE_JPEG #include "libyuv/mjpeg_decoder.h" @@ -65,6 +69,7 @@ int I420ToARGBMatrix(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -115,6 +120,14 @@ int I420ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGBRow = I422ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_LSX; + } + } +#endif #if defined(HAS_I422TOARGBROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToARGBRow = I422ToARGBRow_Any_LASX; @@ -123,6 +136,11 @@ int I420ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToARGBRow = I422ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); @@ -298,6 +316,7 @@ int I422ToARGBMatrix(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -355,6 +374,14 @@ int I422ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGBRow = I422ToARGBRow_Any_LSX; + if 
(IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_LSX; + } + } +#endif #if defined(HAS_I422TOARGBROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToARGBRow = I422ToARGBRow_Any_LASX; @@ -363,6 +390,11 @@ int I422ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToARGBRow = I422ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); @@ -536,6 +568,7 @@ int I444ToARGBMatrix(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I444ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -592,6 +625,11 @@ int I444ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I444TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444ToARGBRow = I444ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); @@ -747,6 +785,133 @@ int U444ToABGR(const uint8_t* src_y, width, height); } +// Convert I444 to RGB24 with matrix. +LIBYUV_API +int I444ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I444ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I444ToRGB24Row_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } + // Coalesce rows. 
+ if (src_stride_y == width && src_stride_u == width && src_stride_v == width && + dst_stride_rgb24 == width * 3) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_rgb24 = 0; + } +#if defined(HAS_I444TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444ToRGB24Row = I444ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + I444ToRGB24Row = I444ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444ToRGB24Row = I444ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I444ToRGB24Row = I444ToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444ToRGB24Row = I444ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToRGB24Row = I444ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444ToRGB24Row = I444ToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444ToRGB24Row = I444ToRGB24Row_MSA; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I444ToRGB24Row = I444ToRGB24Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I444ToRGB24Row = I444ToRGB24Row_LSX; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444ToRGB24Row = I444ToRGB24Row_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + I444ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I444 to RGB24. 
+LIBYUV_API +int I444ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I444ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvI601Constants, width, height); +} + +// Convert I444 to RAW. +LIBYUV_API +int I444ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I444ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + // Convert 10 bit YUV to ARGB with matrix. // TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to // multiply 10 bit yuv into high bits to allow any number of bits. 
@@ -767,6 +932,7 @@ int I010ToAR30Matrix(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I210ToAR30Row_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -926,6 +1092,7 @@ int I012ToAR30Matrix(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I212ToAR30Row_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -983,6 +1150,7 @@ int I210ToAR30Matrix(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I210ToAR30Row_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -1137,6 +1305,7 @@ int I410ToAR30Matrix(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I410ToAR30Row_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -1190,6 +1359,7 @@ int I010ToARGBMatrix(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I210ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1353,6 +1523,7 @@ int I012ToARGBMatrix(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I212ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1408,6 +1579,7 @@ int I210ToARGBMatrix(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I210ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 
0 || height == 0) { return -1; } @@ -1568,6 +1740,7 @@ int I410ToARGBMatrix(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I410ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1617,6 +1790,7 @@ int P010ToARGBMatrix(const uint16_t* src_y, void (*P210ToARGBRow)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1667,6 +1841,7 @@ int P210ToARGBMatrix(const uint16_t* src_y, void (*P210ToARGBRow)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1715,6 +1890,7 @@ int P010ToAR30Matrix(const uint16_t* src_y, void (*P210ToAR30Row)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -1765,6 +1941,7 @@ int P210ToAR30Matrix(const uint16_t* src_y, void (*P210ToAR30Row)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -1823,6 +2000,7 @@ int I420AlphaToARGBMatrix(const uint8_t* src_y, int width) = I422AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1865,6 +2043,14 @@ int 
I420AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422ALPHATOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_LSX; + } + } +#endif #if defined(HAS_I422ALPHATOARGBROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LASX; @@ -1873,6 +2059,11 @@ int I420AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_RVV; + } +#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -1905,6 +2096,11 @@ int I420AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -1947,6 +2143,7 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y, int width) = I422AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1989,6 +2186,14 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422ALPHATOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_LSX; + } + } +#endif #if defined(HAS_I422ALPHATOARGBROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LASX; @@ -1997,6 +2202,11 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_RVV; + 
} +#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -2029,6 +2239,11 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -2069,6 +2284,7 @@ int I444AlphaToARGBMatrix(const uint8_t* src_y, int width) = I444AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -2111,6 +2327,11 @@ int I444AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I444ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_RVV; + } +#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -2143,6 +2364,11 @@ int I444AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I444AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -2312,6 +2538,7 @@ int I010AlphaToARGBMatrix(const uint16_t* src_y, int width) = I210AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -2370,6 +2597,11 @@ int I010AlphaToARGBMatrix(const uint16_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { 
I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -2412,6 +2644,7 @@ int I210AlphaToARGBMatrix(const uint16_t* src_y, int width) = I210AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -2470,6 +2703,11 @@ int I210AlphaToARGBMatrix(const uint16_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -2510,6 +2748,7 @@ int I410AlphaToARGBMatrix(const uint16_t* src_y, int width) = I410AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -2568,6 +2807,11 @@ int I410AlphaToARGBMatrix(const uint16_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I410AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -2597,6 +2841,7 @@ int I400ToARGBMatrix(const uint8_t* src_y, void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I400ToARGBRow_C; + assert(yuvconstants); if (!src_y || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -2652,6 +2897,11 @@ int I400ToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I400TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I400ToARGBRow = I400ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I400ToARGBRow(src_y, dst_argb, yuvconstants, width); @@ -2739,6 +2989,12 @@ int J400ToARGB(const uint8_t* src_y, } } #endif +#if 
defined(HAS_J400TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + J400ToARGBRow = J400ToARGBRow_RVV; + } +#endif + for (y = 0; y < height; ++y) { J400ToARGBRow(src_y, dst_argb, width); src_y += src_stride_y; @@ -2901,6 +3157,11 @@ int RGB24ToARGB(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGB24TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGB24ToARGBRow = RGB24ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { RGB24ToARGBRow(src_rgb24, dst_argb, width); @@ -2976,6 +3237,11 @@ int RAWToARGB(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToARGBRow = RAWToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { RAWToARGBRow(src_raw, dst_argb, width); @@ -3027,6 +3293,11 @@ int RAWToRGBA(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTORGBAROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToRGBARow = RAWToRGBARow_RVV; + } +#endif for (y = 0; y < height; ++y) { RAWToRGBARow(src_raw, dst_rgba, width); @@ -3431,6 +3702,11 @@ int AR64ToARGB(const uint16_t* src_ar64, } } #endif +#if defined(HAS_AR64TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + AR64ToARGBRow = AR64ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { AR64ToARGBRow(src_ar64, dst_argb, width); @@ -3490,6 +3766,11 @@ int AB64ToARGB(const uint16_t* src_ab64, } } #endif +#if defined(HAS_AB64TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + AB64ToARGBRow = AB64ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { AB64ToARGBRow(src_ab64, dst_argb, width); @@ -3514,6 +3795,7 @@ int NV12ToARGBMatrix(const uint8_t* src_y, void (*NV12ToARGBRow)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -3598,6 +3880,7 @@ int NV21ToARGBMatrix(const uint8_t* src_y, void (*NV21ToARGBRow)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* 
rgb_buf, const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C; + assert(yuvconstants); if (!src_y || !src_vu || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -3741,6 +4024,7 @@ int NV12ToRGB24Matrix(const uint8_t* src_y, void (*NV12ToRGB24Row)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = NV12ToRGB24Row_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_rgb24 || width <= 0 || height == 0) { return -1; } @@ -3801,6 +4085,7 @@ int NV21ToRGB24Matrix(const uint8_t* src_y, void (*NV21ToRGB24Row)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = NV21ToRGB24Row_C; + assert(yuvconstants); if (!src_y || !src_vu || !dst_rgb24 || width <= 0 || height == 0) { return -1; } @@ -4143,6 +4428,7 @@ int Android420ToARGBMatrix(const uint8_t* src_y, const ptrdiff_t vu_off = src_v - src_u; int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -4243,6 +4529,7 @@ int I422ToRGBAMatrix(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGBARow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { return -1; } @@ -4284,6 +4571,14 @@ int I422ToRGBAMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGBAROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGBARow = I422ToRGBARow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_LSX; + } + } +#endif #if defined(HAS_I422TORGBAROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToRGBARow = I422ToRGBARow_Any_LASX; @@ -4292,6 +4587,11 @@ int I422ToRGBAMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGBAROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToRGBARow = I422ToRGBARow_RVV; + } 
+#endif for (y = 0; y < height; ++y) { I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); @@ -4354,6 +4654,7 @@ int NV12ToRGB565Matrix(const uint8_t* src_y, void (*NV12ToRGB565Row)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) { return -1; } @@ -4456,6 +4757,7 @@ int I420ToRGBAMatrix(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGBARow_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { return -1; } @@ -4497,6 +4799,14 @@ int I420ToRGBAMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGBAROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGBARow = I422ToRGBARow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_LSX; + } + } +#endif #if defined(HAS_I422TORGBAROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToRGBARow = I422ToRGBARow_Any_LASX; @@ -4505,6 +4815,11 @@ int I420ToRGBAMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGBAROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToRGBARow = I422ToRGBARow_RVV; + } +#endif for (y = 0; y < height; ++y) { I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); @@ -4572,6 +4887,7 @@ int I420ToRGB24Matrix(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGB24Row_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { return -1; } @@ -4613,6 +4929,14 @@ int I420ToRGB24Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGB24ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGB24Row = I422ToRGB24Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_LSX; + } + } +#endif #if 
defined(HAS_I422TORGB24ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToRGB24Row = I422ToRGB24Row_Any_LASX; @@ -4621,6 +4945,11 @@ int I420ToRGB24Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToRGB24Row = I422ToRGB24Row_RVV; + } +#endif for (y = 0; y < height; ++y) { I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); @@ -4742,70 +5071,206 @@ int H420ToRAW(const uint8_t* src_y, width, height); } -// Convert I420 to ARGB1555. +// Convert I422 to RGB24 with matrix. LIBYUV_API -int I420ToARGB1555(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb1555, - int dst_stride_argb1555, - int width, - int height) { +int I422ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; - void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf, - const uint8_t* v_buf, uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToARGB1555Row_C; - if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 || - height == 0) { + void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB24Row_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; - dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555; - dst_stride_argb1555 = -dst_stride_argb1555; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; } -#if defined(HAS_I422TOARGB1555ROW_SSSE3) +#if defined(HAS_I422TORGB24ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToARGB1555Row = I422ToARGB1555Row_SSSE3; + I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_SSSE3; } } #endif -#if defined(HAS_I422TOARGB1555ROW_AVX2) +#if defined(HAS_I422TORGB24ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToARGB1555Row = I422ToARGB1555Row_AVX2; + I422ToRGB24Row = I422ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToRGB24Row = I422ToRGB24Row_AVX2; } } #endif -#if defined(HAS_I422TOARGB1555ROW_NEON) +#if defined(HAS_I422TORGB24ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON; + I422ToRGB24Row = I422ToRGB24Row_Any_NEON; if (IS_ALIGNED(width, 8)) { - I422ToARGB1555Row = I422ToARGB1555Row_NEON; + I422ToRGB24Row = I422ToRGB24Row_NEON; } } #endif -#if defined(HAS_I422TOARGB1555ROW_MSA) +#if defined(HAS_I422TORGB24ROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { - I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToARGB1555Row = I422ToARGB1555Row_MSA; + I422ToRGB24Row = I422ToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_MSA; } } #endif -#if defined(HAS_I422TOARGB1555ROW_LASX) +#if defined(HAS_I422TORGB24ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGB24Row = I422ToRGB24Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_LSX; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { - 
I422ToARGB1555Row = I422ToARGB1555Row_Any_LASX; - if (IS_ALIGNED(width, 8)) { - I422ToARGB1555Row = I422ToARGB1555Row_LASX; + I422ToRGB24Row = I422ToRGB24Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I422ToRGB24Row = I422ToRGB24Row_LASX; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToRGB24Row = I422ToRGB24Row_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to RGB24. +LIBYUV_API +int I422ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I422ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvI601Constants, width, height); +} + +// Convert I422 to RAW. +LIBYUV_API +int I422ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I422ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert I420 to ARGB1555. 
+LIBYUV_API +int I420ToARGB1555(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height) { + int y; + void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) = I422ToARGB1555Row_C; + if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555; + dst_stride_argb1555 = -dst_stride_argb1555; + } +#if defined(HAS_I422TOARGB1555ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGB1555Row = I422ToARGB1555Row_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_NEON; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_MSA; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToARGB1555Row = I422ToARGB1555Row_LSX; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_LASX; + if (IS_ALIGNED(width, 8)) { + 
I422ToARGB1555Row = I422ToARGB1555Row_LASX; } } #endif @@ -4882,6 +5347,14 @@ int I420ToARGB4444(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGB4444ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToARGB4444Row = I422ToARGB4444Row_LSX; + } + } +#endif #if defined(HAS_I422TOARGB4444ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToARGB4444Row = I422ToARGB4444Row_Any_LASX; @@ -4922,6 +5395,7 @@ int I420ToRGB565Matrix(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGB565Row_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { return -1; } @@ -4963,6 +5437,14 @@ int I420ToRGB565Matrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGB565ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGB565Row = I422ToRGB565Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGB565Row = I422ToRGB565Row_LSX; + } + } +#endif #if defined(HAS_I422TORGB565ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToRGB565Row = I422ToRGB565Row_Any_LASX; @@ -5035,23 +5517,25 @@ int H420ToRGB565(const uint8_t* src_y, &kYuvH709Constants, width, height); } -// Convert I422 to RGB565. +// Convert I422 to RGB565 with specified color matrix. 
LIBYUV_API -int I422ToRGB565(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_rgb565, - int dst_stride_rgb565, - int width, - int height) { +int I422ToRGB565Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGB565Row_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { return -1; } @@ -5093,6 +5577,14 @@ int I422ToRGB565(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TORGB565ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToRGB565Row = I422ToRGB565Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToRGB565Row = I422ToRGB565Row_LSX; + } + } +#endif #if defined(HAS_I422TORGB565ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToRGB565Row = I422ToRGB565Row_Any_LASX; @@ -5103,7 +5595,7 @@ int I422ToRGB565(const uint8_t* src_y, #endif for (y = 0; y < height; ++y) { - I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width); + I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width); dst_rgb565 += dst_stride_rgb565; src_y += src_stride_y; src_u += src_stride_u; @@ -5112,6 +5604,23 @@ int I422ToRGB565(const uint8_t* src_y, return 0; } +// Convert I422 to RGB565. 
+LIBYUV_API +int I422ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + return I422ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb565, dst_stride_rgb565, + &kYuvI601Constants, width, height); +} + // Ordered 8x8 dither for 888 to 565. Values from 0 to 7. static const uint8_t kDither565_4x4[16] = { 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, @@ -5136,7 +5645,7 @@ int I420ToRGB565Dither(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) = I422ToARGBRow_C; void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, int width) = + uint32_t dither4, int width) = ARGBToRGB565DitherRow_C; if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { return -1; @@ -5191,6 +5700,14 @@ int I420ToRGB565Dither(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGBRow = I422ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_LSX; + } + } +#endif #if defined(HAS_I422TOARGBROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToARGBRow = I422ToARGBRow_Any_LASX; @@ -5199,6 +5716,11 @@ int I420ToRGB565Dither(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToARGBRow = I422ToARGBRow_RVV; + } +#endif #if defined(HAS_ARGBTORGB565DITHERROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; @@ -5231,6 +5753,14 @@ int I420ToRGB565Dither(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBTORGB565DITHERROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LSX; + } + } +#endif #if 
defined(HAS_ARGBTORGB565DITHERROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LASX; @@ -5278,6 +5808,7 @@ int I420ToAR30Matrix(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) = I422ToAR30Row_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -5401,9 +5932,12 @@ static int I420ToARGBMatrixBilinear(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I444ToARGBRow_C; - void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleRowUp2_Bilinear_Any_C; + void (*Scale2RowUp_Bilinear)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, ptrdiff_t dst_stride, + int dst_width) = ScaleRowUp2_Bilinear_Any_C; + void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -5453,48 +5987,57 @@ static int I420ToARGBMatrixBilinear(const uint8_t* src_y, } } #endif +#if defined(HAS_I444TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444ToARGBRow = I444ToARGBRow_RVV; + } +#endif -#if defined(HAS_SCALEROWUP2_LINEAR_SSE2) +#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2; + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSE2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) +#if defined(HAS_SCALEROWUP2_BILINEAR_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3; + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSSE3; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_AVX2) +#if defined(HAS_SCALEROWUP2_BILINEAR_AVX2) if 
(TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2; + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_AVX2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_NEON) +#if defined(HAS_SCALEROWUP2_BILINEAR_NEON) if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON; + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_NEON; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; } #endif // alloc 4 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 4); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 4); uint8_t* temp_u_1 = row; - uint8_t* temp_u_2 = row + kRowSize; - uint8_t* temp_v_1 = row + kRowSize * 2; - uint8_t* temp_v_2 = row + kRowSize * 3; + uint8_t* temp_u_2 = row + row_size; + uint8_t* temp_v_1 = row + row_size * 2; + uint8_t* temp_v_2 = row + row_size * 3; - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear(src_u, temp_u_1, width); + ScaleRowUp2_Linear(src_v, temp_v_1, width); I444ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; for (y = 0; y < height - 2; y += 2) { - Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, src_stride_v, temp_v_1, kRowSize, width); + Scale2RowUp_Bilinear(src_u, src_stride_u, temp_u_1, row_size, width); + Scale2RowUp_Bilinear(src_v, src_stride_v, temp_v_1, row_size, width); I444ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; @@ -5506,8 +6049,8 @@ static int I420ToARGBMatrixBilinear(const uint8_t* src_y, } if (!(height & 1)) { - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear(src_u, temp_u_1, width); + ScaleRowUp2_Linear(src_v, temp_v_1, width); 
I444ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); } @@ -5531,8 +6074,9 @@ static int I422ToARGBMatrixLinear(const uint8_t* src_y, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I444ToARGBRow_C; - void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) = - ScaleRowUp2_Linear_Any_C; + void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -5582,36 +6126,41 @@ static int I422ToARGBMatrixLinear(const uint8_t* src_y, } } #endif +#if defined(HAS_I444TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444ToARGBRow = I444ToARGBRow_RVV; + } +#endif #if defined(HAS_SCALEROWUP2_LINEAR_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_NEON; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; } #endif // alloc 2 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 2); uint8_t* temp_u = row; - uint8_t* temp_v = row + kRowSize; + uint8_t* temp_v = row + row_size; for (y = 0; y < height; ++y) { - ScaleRowUp(src_u, temp_u, width); - ScaleRowUp(src_v, temp_v, width); + ScaleRowUp2_Linear(src_u, temp_u, width); + ScaleRowUp2_Linear(src_v, temp_v, width); 
I444ToARGBRow(src_y, temp_u, temp_v, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; @@ -5623,6 +6172,148 @@ static int I422ToARGBMatrixLinear(const uint8_t* src_y, return 0; } +static int I420ToRGB24MatrixBilinear(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I444ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I444ToRGB24Row_C; + void (*Scale2RowUp_Bilinear)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, ptrdiff_t dst_stride, + int dst_width) = ScaleRowUp2_Bilinear_Any_C; + void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_Any_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_I444TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444ToRGB24Row = I444ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + I444ToRGB24Row = I444ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444ToRGB24Row = I444ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I444ToRGB24Row = I444ToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444ToRGB24Row = I444ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToRGB24Row = I444ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444ToRGB24Row = I444ToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444ToRGB24Row = I444ToRGB24Row_MSA; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + I444ToRGB24Row = I444ToRGB24Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + I444ToRGB24Row = I444ToRGB24Row_LASX; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444ToRGB24Row = I444ToRGB24Row_RVV; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSE2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSSE3; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_AVX2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp_Bilinear = 
ScaleRowUp2_Bilinear_Any_NEON; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; + } +#endif + + // alloc 4 lines temp + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 4); + uint8_t* temp_u_1 = row; + uint8_t* temp_u_2 = row + row_size; + uint8_t* temp_v_1 = row + row_size * 2; + uint8_t* temp_v_2 = row + row_size * 3; + + ScaleRowUp2_Linear(src_u, temp_u_1, width); + ScaleRowUp2_Linear(src_v, temp_v_1, width); + I444ToRGB24Row(src_y, temp_u_1, temp_v_1, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + + for (y = 0; y < height - 2; y += 2) { + Scale2RowUp_Bilinear(src_u, src_stride_u, temp_u_1, row_size, width); + Scale2RowUp_Bilinear(src_v, src_stride_v, temp_v_1, row_size, width); + I444ToRGB24Row(src_y, temp_u_1, temp_v_1, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + I444ToRGB24Row(src_y, temp_u_2, temp_v_2, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + + if (!(height & 1)) { + ScaleRowUp2_Linear(src_u, temp_u_1, width); + ScaleRowUp2_Linear(src_v, temp_v_1, width); + I444ToRGB24Row(src_y, temp_u_1, temp_v_1, dst_rgb24, yuvconstants, width); + } + + free_aligned_buffer_64(row); + return 0; +} + static int I010ToAR30MatrixBilinear(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, @@ -5639,9 +6330,12 @@ static int I010ToAR30MatrixBilinear(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I410ToAR30Row_C; - void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleRowUp2_Bilinear_16_Any_C; + void (*Scale2RowUp_Bilinear_12)( + const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, + ptrdiff_t dst_stride, int dst_width) = ScaleRowUp2_Bilinear_16_Any_C; + void 
(*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -5668,41 +6362,44 @@ static int I010ToAR30MatrixBilinear(const uint16_t* src_y, } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_SSSE3; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_AVX2; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_NEON) if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_NEON; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON; } #endif // alloc 4 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 4 * sizeof(uint16_t)); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 4 * sizeof(uint16_t)); uint16_t* temp_u_1 = (uint16_t*)(row); - uint16_t* temp_u_2 = (uint16_t*)(row) + kRowSize; - uint16_t* temp_v_1 = (uint16_t*)(row) + kRowSize * 2; - uint16_t* temp_v_2 = (uint16_t*)(row) + kRowSize * 3; + uint16_t* temp_u_2 = (uint16_t*)(row) + row_size; + uint16_t* temp_v_1 = (uint16_t*)(row) + row_size * 2; + uint16_t* temp_v_2 = (uint16_t*)(row) + row_size * 3; - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear_12(src_u, temp_u_1, width); + 
ScaleRowUp2_Linear_12(src_v, temp_v_1, width); I410ToAR30Row(src_y, temp_u_1, temp_v_1, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; for (y = 0; y < height - 2; y += 2) { - Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, src_stride_v, temp_v_1, kRowSize, width); + Scale2RowUp_Bilinear_12(src_u, src_stride_u, temp_u_1, row_size, width); + Scale2RowUp_Bilinear_12(src_v, src_stride_v, temp_v_1, row_size, width); I410ToAR30Row(src_y, temp_u_1, temp_v_1, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; @@ -5714,8 +6411,8 @@ static int I010ToAR30MatrixBilinear(const uint16_t* src_y, } if (!(height & 1)) { - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear_12(src_u, temp_u_1, width); + ScaleRowUp2_Linear_12(src_v, temp_v_1, width); I410ToAR30Row(src_y, temp_u_1, temp_v_1, dst_ar30, yuvconstants, width); } @@ -5740,8 +6437,9 @@ static int I210ToAR30MatrixLinear(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I410ToAR30Row_C; - void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, - int dst_width) = ScaleRowUp2_Linear_16_Any_C; + void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -5770,29 +6468,29 @@ static int I210ToAR30MatrixLinear(const uint16_t* src_y, #if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2; } #endif 
#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON; } #endif // alloc 2 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 2 * sizeof(uint16_t)); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 2 * sizeof(uint16_t)); uint16_t* temp_u = (uint16_t*)(row); - uint16_t* temp_v = (uint16_t*)(row) + kRowSize; + uint16_t* temp_v = (uint16_t*)(row) + row_size; for (y = 0; y < height; ++y) { - ScaleRowUp(src_u, temp_u, width); - ScaleRowUp(src_v, temp_v, width); + ScaleRowUp2_Linear_12(src_u, temp_u, width); + ScaleRowUp2_Linear_12(src_v, temp_v, width); I410ToAR30Row(src_y, temp_u, temp_v, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; @@ -5819,9 +6517,12 @@ static int I010ToARGBMatrixBilinear(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I410ToARGBRow_C; - void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleRowUp2_Bilinear_16_Any_C; + void (*Scale2RowUp_Bilinear_12)( + const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, + ptrdiff_t dst_stride, int dst_width) = ScaleRowUp2_Bilinear_16_Any_C; + void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -5848,41 +6549,44 @@ static int I010ToARGBMatrixBilinear(const uint16_t* src_y, } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_SSSE3; + ScaleRowUp2_Linear_12 = 
ScaleRowUp2_Linear_12_Any_SSSE3; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_AVX2; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_NEON) if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_NEON; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON; } #endif // alloc 4 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 4 * sizeof(uint16_t)); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 4 * sizeof(uint16_t)); uint16_t* temp_u_1 = (uint16_t*)(row); - uint16_t* temp_u_2 = (uint16_t*)(row) + kRowSize; - uint16_t* temp_v_1 = (uint16_t*)(row) + kRowSize * 2; - uint16_t* temp_v_2 = (uint16_t*)(row) + kRowSize * 3; + uint16_t* temp_u_2 = (uint16_t*)(row) + row_size; + uint16_t* temp_v_1 = (uint16_t*)(row) + row_size * 2; + uint16_t* temp_v_2 = (uint16_t*)(row) + row_size * 3; - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear_12(src_u, temp_u_1, width); + ScaleRowUp2_Linear_12(src_v, temp_v_1, width); I410ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; for (y = 0; y < height - 2; y += 2) { - Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, src_stride_v, temp_v_1, kRowSize, width); + Scale2RowUp_Bilinear_12(src_u, src_stride_u, temp_u_1, row_size, width); + Scale2RowUp_Bilinear_12(src_v, src_stride_v, temp_v_1, row_size, width); I410ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += 
src_stride_y; @@ -5894,8 +6598,8 @@ static int I010ToARGBMatrixBilinear(const uint16_t* src_y, } if (!(height & 1)) { - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear_12(src_u, temp_u_1, width); + ScaleRowUp2_Linear_12(src_v, temp_v_1, width); I410ToARGBRow(src_y, temp_u_1, temp_v_1, dst_argb, yuvconstants, width); } @@ -5919,8 +6623,9 @@ static int I210ToARGBMatrixLinear(const uint16_t* src_y, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I410ToARGBRow_C; - void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, - int dst_width) = ScaleRowUp2_Linear_16_Any_C; + void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -5949,29 +6654,29 @@ static int I210ToARGBMatrixLinear(const uint16_t* src_y, #if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON; } #endif // alloc 2 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 2 * sizeof(uint16_t)); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 2 * sizeof(uint16_t)); uint16_t* temp_u = (uint16_t*)(row); - uint16_t* temp_v = (uint16_t*)(row) + kRowSize; + uint16_t* temp_v = (uint16_t*)(row) + row_size; for (y = 0; y < height; ++y) { - 
ScaleRowUp(src_u, temp_u, width); - ScaleRowUp(src_v, temp_v, width); + ScaleRowUp2_Linear_12(src_u, temp_u, width); + ScaleRowUp2_Linear_12(src_v, temp_v, width); I410ToARGBRow(src_y, temp_u, temp_v, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; @@ -6006,9 +6711,12 @@ static int I420AlphaToARGBMatrixBilinear( int width) = I444AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; - void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleRowUp2_Bilinear_Any_C; + void (*Scale2RowUp_Bilinear)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, ptrdiff_t dst_stride, + int dst_width) = ScaleRowUp2_Bilinear_Any_C; + void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -6059,6 +6767,11 @@ static int I420AlphaToARGBMatrixBilinear( } } #endif +#if defined(HAS_I444ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_RVV; + } +#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -6091,40 +6804,50 @@ static int I420AlphaToARGBMatrixBilinear( } } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_SSE2) +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif + +#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2; + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSE2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) +#if defined(HAS_SCALEROWUP2_BILINEAR_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp 
= ScaleRowUp2_Bilinear_Any_SSSE3; + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_SSSE3; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_AVX2) +#if defined(HAS_SCALEROWUP2_BILINEAR_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2; + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_AVX2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_NEON) +#if defined(HAS_SCALEROWUP2_BILINEAR_NEON) if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON; + Scale2RowUp_Bilinear = ScaleRowUp2_Bilinear_Any_NEON; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; } #endif // alloc 4 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 4); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 4); uint8_t* temp_u_1 = row; - uint8_t* temp_u_2 = row + kRowSize; - uint8_t* temp_v_1 = row + kRowSize * 2; - uint8_t* temp_v_2 = row + kRowSize * 3; + uint8_t* temp_u_2 = row + row_size; + uint8_t* temp_v_1 = row + row_size * 2; + uint8_t* temp_v_2 = row + row_size * 3; - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear(src_u, temp_u_1, width); + ScaleRowUp2_Linear(src_v, temp_v_1, width); I444AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, width); if (attenuate) { @@ -6135,8 +6858,8 @@ static int I420AlphaToARGBMatrixBilinear( src_a += src_stride_a; for (y = 0; y < height - 2; y += 2) { - Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, src_stride_v, temp_v_1, kRowSize, width); + Scale2RowUp_Bilinear(src_u, src_stride_u, temp_u_1, row_size, width); + Scale2RowUp_Bilinear(src_v, src_stride_v, temp_v_1, row_size, width); I444AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, width); if (attenuate) { @@ -6158,8 +6881,8 @@ static int 
I420AlphaToARGBMatrixBilinear( } if (!(height & 1)) { - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear(src_u, temp_u_1, width); + ScaleRowUp2_Linear(src_v, temp_v_1, width); I444AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, width); if (attenuate) { @@ -6193,8 +6916,9 @@ static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y, int width) = I444AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; - void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) = - ScaleRowUp2_Linear_Any_C; + void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -6245,6 +6969,11 @@ static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y, } } #endif +#if defined(HAS_I444ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_RVV; + } +#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -6277,36 +7006,42 @@ static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif + #if defined(HAS_SCALEROWUP2_LINEAR_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2; + ScaleRowUp2_Linear = 
ScaleRowUp2_Linear_Any_AVX2; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleRowUp2_Linear_Any_NEON; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; } #endif // alloc 2 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 2); uint8_t* temp_u = row; - uint8_t* temp_v = row + kRowSize; + uint8_t* temp_v = row + row_size; for (y = 0; y < height; ++y) { - ScaleRowUp(src_u, temp_u, width); - ScaleRowUp(src_v, temp_v, width); + ScaleRowUp2_Linear(src_u, temp_u, width); + ScaleRowUp2_Linear(src_v, temp_v, width); I444AlphaToARGBRow(src_y, temp_u, temp_v, src_a, dst_argb, yuvconstants, width); if (attenuate) { @@ -6346,9 +7081,12 @@ static int I010AlphaToARGBMatrixBilinear( int width) = I410AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; - void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleRowUp2_Bilinear_16_Any_C; + void (*Scale2RowUp_Bilinear_12)( + const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, + ptrdiff_t dst_stride, int dst_width) = ScaleRowUp2_Bilinear_16_Any_C; + void (*ScaleRowUp2_Linear_12)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -6407,35 +7145,43 @@ static int I010AlphaToARGBMatrixBilinear( } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3; + Scale2RowUp_Bilinear_12 = 
ScaleRowUp2_Bilinear_12_Any_SSSE3; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_AVX2; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_AVX2; } #endif -#if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) +#if defined(HAS_SCALEROWUP2_BILINEAR_12_NEON) if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON; + Scale2RowUp_Bilinear_12 = ScaleRowUp2_Bilinear_12_Any_NEON; + ScaleRowUp2_Linear_12 = ScaleRowUp2_Linear_12_Any_NEON; } #endif // alloc 4 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 4 * sizeof(uint16_t)); + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 4 * sizeof(uint16_t)); uint16_t* temp_u_1 = (uint16_t*)(row); - uint16_t* temp_u_2 = (uint16_t*)(row) + kRowSize; - uint16_t* temp_v_1 = (uint16_t*)(row) + kRowSize * 2; - uint16_t* temp_v_2 = (uint16_t*)(row) + kRowSize * 3; + uint16_t* temp_u_2 = (uint16_t*)(row) + row_size; + uint16_t* temp_v_1 = (uint16_t*)(row) + row_size * 2; + uint16_t* temp_v_2 = (uint16_t*)(row) + row_size * 3; - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear_12(src_u, temp_u_1, width); + ScaleRowUp2_Linear_12(src_v, temp_v_1, width); I410AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, width); if (attenuate) { @@ -6446,8 +7192,8 @@ static int I010AlphaToARGBMatrixBilinear( src_a += src_stride_a; for (y = 0; y < height - 2; y += 2) { - Scale2RowUp(src_u, src_stride_u, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, src_stride_v, temp_v_1, kRowSize, width); + Scale2RowUp_Bilinear_12(src_u, src_stride_u, temp_u_1, row_size, width); + Scale2RowUp_Bilinear_12(src_v, src_stride_v, temp_v_1, row_size, 
width); I410AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, width); if (attenuate) { @@ -6469,8 +7215,8 @@ static int I010AlphaToARGBMatrixBilinear( } if (!(height & 1)) { - Scale2RowUp(src_u, 0, temp_u_1, kRowSize, width); - Scale2RowUp(src_v, 0, temp_v_1, kRowSize, width); + ScaleRowUp2_Linear_12(src_u, temp_u_1, width); + ScaleRowUp2_Linear_12(src_v, temp_v_1, width); I410AlphaToARGBRow(src_y, temp_u_1, temp_v_1, src_a, dst_argb, yuvconstants, width); if (attenuate) { @@ -6504,8 +7250,9 @@ static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y, int width) = I410AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; - void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, - int dst_width) = ScaleRowUp2_Linear_16_Any_C; + void (*ScaleRowUp2_Linear)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_u || !src_v || !src_a || !dst_argb || width <= 0 || height == 0) { return -1; @@ -6564,32 +7311,37 @@ static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif #if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_12_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_12_Any_AVX2; } #endif #if defined(HAS_SCALEROWUP2_LINEAR_12_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON; + ScaleRowUp2_Linear = ScaleRowUp2_Linear_12_Any_NEON; } #endif // alloc 2 lines temp - const int kRowSize = (width + 31) & ~31; - align_buffer_64(row, kRowSize * 2 * sizeof(uint16_t)); + const int 
row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 2 * sizeof(uint16_t)); uint16_t* temp_u = (uint16_t*)(row); - uint16_t* temp_v = (uint16_t*)(row) + kRowSize; + uint16_t* temp_v = (uint16_t*)(row) + row_size; for (y = 0; y < height; ++y) { - ScaleRowUp(src_u, temp_u, width); - ScaleRowUp(src_v, temp_v, width); + ScaleRowUp2_Linear(src_u, temp_u, width); + ScaleRowUp2_Linear(src_v, temp_v, width); I410AlphaToARGBRow(src_y, temp_u, temp_v, src_a, dst_argb, yuvconstants, width); if (attenuate) { @@ -6618,9 +7370,10 @@ static int P010ToARGBMatrixBilinear(const uint16_t* src_y, void (*P410ToARGBRow)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P410ToARGBRow_C; - void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleUVRowUp2_Bilinear_16_Any_C; + void (*Scale2RowUp_Bilinear_16)( + const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, + ptrdiff_t dst_stride, int dst_width) = ScaleUVRowUp2_Bilinear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -6649,35 +7402,35 @@ static int P010ToARGBMatrixBilinear(const uint16_t* src_y, #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 if (TestCpuFlag(kCpuHasSSE41)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41; + Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_SSE41; } #endif #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2; + Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_AVX2; } #endif #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON; + Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_NEON; } #endif // alloc 2 lines temp - const int kRowSize = (2 * width + 31) & ~31; - align_buffer_64(row, kRowSize * 2 * 
sizeof(uint16_t)); + const int row_size = (2 * width + 31) & ~31; + align_buffer_64(row, row_size * 2 * sizeof(uint16_t)); uint16_t* temp_uv_1 = (uint16_t*)(row); - uint16_t* temp_uv_2 = (uint16_t*)(row) + kRowSize; + uint16_t* temp_uv_2 = (uint16_t*)(row) + row_size; - Scale2RowUp(src_uv, 0, temp_uv_1, kRowSize, width); + Scale2RowUp_Bilinear_16(src_uv, 0, temp_uv_1, row_size, width); P410ToARGBRow(src_y, temp_uv_1, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; for (y = 0; y < height - 2; y += 2) { - Scale2RowUp(src_uv, src_stride_uv, temp_uv_1, kRowSize, width); + Scale2RowUp_Bilinear_16(src_uv, src_stride_uv, temp_uv_1, row_size, width); P410ToARGBRow(src_y, temp_uv_1, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; @@ -6688,7 +7441,7 @@ static int P010ToARGBMatrixBilinear(const uint16_t* src_y, } if (!(height & 1)) { - Scale2RowUp(src_uv, 0, temp_uv_1, kRowSize, width); + Scale2RowUp_Bilinear_16(src_uv, 0, temp_uv_1, row_size, width); P410ToARGBRow(src_y, temp_uv_1, dst_argb, yuvconstants, width); } @@ -6709,8 +7462,9 @@ static int P210ToARGBMatrixLinear(const uint16_t* src_y, void (*P410ToARGBRow)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P410ToARGBRow_C; - void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) = - ScaleUVRowUp2_Linear_16_Any_C; + void (*ScaleRowUp2_Linear)(const uint16_t* src_uv, uint16_t* dst_uv, + int dst_width) = ScaleUVRowUp2_Linear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -6739,28 +7493,28 @@ static int P210ToARGBMatrixLinear(const uint16_t* src_y, #ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 if (TestCpuFlag(kCpuHasSSE41)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41; + ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_SSE41; } #endif #ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 if 
(TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2; + ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_AVX2; } #endif #ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON; + ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_NEON; } #endif - const int kRowSize = (2 * width + 31) & ~31; - align_buffer_64(row, kRowSize * sizeof(uint16_t)); + const int row_size = (2 * width + 31) & ~31; + align_buffer_64(row, row_size * sizeof(uint16_t)); uint16_t* temp_uv = (uint16_t*)(row); for (y = 0; y < height; ++y) { - ScaleRowUp(src_uv, temp_uv, width); + ScaleRowUp2_Linear(src_uv, temp_uv, width); P410ToARGBRow(src_y, temp_uv, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; @@ -6784,9 +7538,10 @@ static int P010ToAR30MatrixBilinear(const uint16_t* src_y, void (*P410ToAR30Row)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P410ToAR30Row_C; - void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleUVRowUp2_Bilinear_16_Any_C; + void (*Scale2RowUp_Bilinear_16)( + const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, + ptrdiff_t dst_stride, int dst_width) = ScaleUVRowUp2_Bilinear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -6815,35 +7570,35 @@ static int P010ToAR30MatrixBilinear(const uint16_t* src_y, #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 if (TestCpuFlag(kCpuHasSSE41)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41; + Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_SSE41; } #endif #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2; + Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_AVX2; } #endif #ifdef 
HAS_SCALEUVROWUP2_BILINEAR_16_NEON if (TestCpuFlag(kCpuHasNEON)) { - Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON; + Scale2RowUp_Bilinear_16 = ScaleUVRowUp2_Bilinear_16_Any_NEON; } #endif // alloc 2 lines temp - const int kRowSize = (2 * width + 31) & ~31; - align_buffer_64(row, kRowSize * 2 * sizeof(uint16_t)); + const int row_size = (2 * width + 31) & ~31; + align_buffer_64(row, row_size * 2 * sizeof(uint16_t)); uint16_t* temp_uv_1 = (uint16_t*)(row); - uint16_t* temp_uv_2 = (uint16_t*)(row) + kRowSize; + uint16_t* temp_uv_2 = (uint16_t*)(row) + row_size; - Scale2RowUp(src_uv, 0, temp_uv_1, kRowSize, width); + Scale2RowUp_Bilinear_16(src_uv, 0, temp_uv_1, row_size, width); P410ToAR30Row(src_y, temp_uv_1, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; for (y = 0; y < height - 2; y += 2) { - Scale2RowUp(src_uv, src_stride_uv, temp_uv_1, kRowSize, width); + Scale2RowUp_Bilinear_16(src_uv, src_stride_uv, temp_uv_1, row_size, width); P410ToAR30Row(src_y, temp_uv_1, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; @@ -6854,7 +7609,7 @@ static int P010ToAR30MatrixBilinear(const uint16_t* src_y, } if (!(height & 1)) { - Scale2RowUp(src_uv, 0, temp_uv_1, kRowSize, width); + Scale2RowUp_Bilinear_16(src_uv, 0, temp_uv_1, row_size, width); P410ToAR30Row(src_y, temp_uv_1, dst_ar30, yuvconstants, width); } @@ -6875,8 +7630,9 @@ static int P210ToAR30MatrixLinear(const uint16_t* src_y, void (*P410ToAR30Row)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P410ToAR30Row_C; - void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) = - ScaleUVRowUp2_Linear_16_Any_C; + void (*ScaleRowUp2_Linear)(const uint16_t* src_uv, uint16_t* dst_uv, + int dst_width) = ScaleUVRowUp2_Linear_16_Any_C; + assert(yuvconstants); if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -6905,28 +7661,28 @@ static 
int P210ToAR30MatrixLinear(const uint16_t* src_y, #ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 if (TestCpuFlag(kCpuHasSSE41)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41; + ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_SSE41; } #endif #ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2; + ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_AVX2; } #endif #ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON; + ScaleRowUp2_Linear = ScaleUVRowUp2_Linear_16_Any_NEON; } #endif - const int kRowSize = (2 * width + 31) & ~31; - align_buffer_64(row, kRowSize * sizeof(uint16_t)); + const int row_size = (2 * width + 31) & ~31; + align_buffer_64(row, row_size * sizeof(uint16_t)); uint16_t* temp_uv = (uint16_t*)(row); for (y = 0; y < height; ++y) { - ScaleRowUp(src_uv, temp_uv, width); + ScaleRowUp2_Linear(src_uv, temp_uv, width); P410ToAR30Row(src_y, temp_uv, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; @@ -6937,6 +7693,133 @@ static int P210ToAR30MatrixLinear(const uint16_t* src_y, return 0; } +static int I422ToRGB24MatrixLinear(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I444ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I444ToRGB24Row_C; + void (*ScaleRowUp2_Linear)(const uint8_t* src_ptr, uint8_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_Any_C; + assert(yuvconstants); + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_I444TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444ToRGB24Row = I444ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + I444ToRGB24Row = I444ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444ToRGB24Row = I444ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I444ToRGB24Row = I444ToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444ToRGB24Row = I444ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToRGB24Row = I444ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_I444TORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444ToRGB24Row = I444ToRGB24Row_RVV; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSSE3; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_AVX2; + } +#endif +#if defined(HAS_SCALEROWUP2_LINEAR_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_NEON; + } +#endif + + // alloc 2 lines temp + const int row_size = (width + 31) & ~31; + align_buffer_64(row, row_size * 2); + uint8_t* temp_u = row; + uint8_t* temp_v = row + row_size; + + for (y = 0; y < height; ++y) { + ScaleRowUp2_Linear(src_u, temp_u, width); + ScaleRowUp2_Linear(src_v, temp_v, width); + I444ToRGB24Row(src_y, temp_u, temp_v, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + + free_aligned_buffer_64(row); + return 0; +} + +LIBYUV_API +int 
I422ToRGB24MatrixFilter(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I422ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + yuvconstants, width, height); + case kFilterBilinear: + case kFilterBox: + case kFilterLinear: + return I422ToRGB24MatrixLinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + dst_rgb24, dst_stride_rgb24, yuvconstants, width, height); + } + + return -1; +} + LIBYUV_API int I420ToARGBMatrixFilter(const uint8_t* src_y, int src_stride_y, @@ -6997,6 +7880,35 @@ int I422ToARGBMatrixFilter(const uint8_t* src_y, return -1; } +LIBYUV_API +int I420ToRGB24MatrixFilter(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height, + enum FilterMode filter) { + switch (filter) { + case kFilterNone: + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + yuvconstants, width, height); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 + case kFilterBilinear: + case kFilterBox: + return I420ToRGB24MatrixBilinear( + src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + dst_rgb24, dst_stride_rgb24, yuvconstants, width, height); + } + + return -1; +} + LIBYUV_API int I010ToAR30MatrixFilter(const uint16_t* src_y, int src_stride_y, @@ -7015,13 +7927,12 @@ int I010ToAR30MatrixFilter(const uint16_t* src_y, return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_ar30, dst_stride_ar30, yuvconstants, 
width, height); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 case kFilterBilinear: case kFilterBox: return I010ToAR30MatrixBilinear( src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_ar30, dst_stride_ar30, yuvconstants, width, height); - case kFilterLinear: - return -1; } return -1; @@ -7074,13 +7985,12 @@ int I010ToARGBMatrixFilter(const uint16_t* src_y, return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_argb, dst_stride_argb, yuvconstants, width, height); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 case kFilterBilinear: case kFilterBox: return I010ToARGBMatrixBilinear( src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_argb, dst_stride_argb, yuvconstants, width, height); - case kFilterLinear: - return -1; } return -1; @@ -7137,14 +8047,13 @@ int I420AlphaToARGBMatrixFilter(const uint8_t* src_y, src_v, src_stride_v, src_a, src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height, attenuate); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 case kFilterBilinear: case kFilterBox: return I420AlphaToARGBMatrixBilinear( src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a, src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height, attenuate); - case kFilterLinear: - return -1; } return -1; @@ -7206,14 +8115,13 @@ int I010AlphaToARGBMatrixFilter(const uint16_t* src_y, src_v, src_stride_v, src_a, src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height, attenuate); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 case kFilterBilinear: case kFilterBox: return I010AlphaToARGBMatrixBilinear( src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a, src_stride_a, dst_argb, dst_stride_argb, yuvconstants, width, height, attenuate); - case kFilterLinear: - return -1; } return -1; @@ -7253,6 +8161,8 @@ int 
I210AlphaToARGBMatrixFilter(const uint16_t* src_y, return -1; } +// TODO(fb): Verify this function works correctly. P010 is like NV12 but 10 bit +// UV is biplanar. LIBYUV_API int P010ToARGBMatrixFilter(const uint16_t* src_y, int src_stride_y, @@ -7269,13 +8179,12 @@ int P010ToARGBMatrixFilter(const uint16_t* src_y, return P010ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb, dst_stride_argb, yuvconstants, width, height); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 case kFilterBilinear: case kFilterBox: return P010ToARGBMatrixBilinear(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb, dst_stride_argb, yuvconstants, width, height); - case kFilterLinear: - return -1; } return -1; @@ -7324,13 +8233,12 @@ int P010ToAR30MatrixFilter(const uint16_t* src_y, return P010ToAR30Matrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_ar30, dst_stride_ar30, yuvconstants, width, height); + case kFilterLinear: // TODO(fb): Implement Linear using Bilinear stride 0 case kFilterBilinear: case kFilterBox: return P010ToAR30MatrixBilinear(src_y, src_stride_y, src_uv, src_stride_uv, dst_ar30, dst_stride_ar30, yuvconstants, width, height); - case kFilterLinear: - return -1; } return -1; diff --git a/files/source/convert_from.cc b/files/source/convert_from.cc index 8bd07e4c..4102d610 100644 --- a/files/source/convert_from.cc +++ b/files/source/convert_from.cc @@ -446,6 +446,14 @@ int I420ToYUY2(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOYUY2ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToYUY2Row = I422ToYUY2Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_LSX; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToYUY2Row = I422ToYUY2Row_Any_LASX; @@ -533,6 +541,14 @@ int I422ToUYVY(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOUYVYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToUYVYRow = I422ToUYVYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + 
I422ToUYVYRow = I422ToUYVYRow_LSX; + } + } +#endif #if defined(HAS_I422TOUYVYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToUYVYRow = I422ToUYVYRow_Any_LASX; @@ -608,6 +624,14 @@ int I420ToUYVY(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOUYVYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToUYVYRow = I422ToUYVYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_LSX; + } + } +#endif #if defined(HAS_I422TOUYVYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToUYVYRow = I422ToUYVYRow_Any_LASX; diff --git a/files/source/convert_from_argb.cc b/files/source/convert_from_argb.cc index e50c2af3..c3d037c4 100644 --- a/files/source/convert_from_argb.cc +++ b/files/source/convert_from_argb.cc @@ -76,6 +76,14 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUV444ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToUV444Row = ARGBToUV444Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_LSX; + } + } +#endif #if defined(HAS_ARGBTOUV444ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToUV444Row = ARGBToUV444Row_Any_LASX; @@ -116,6 +124,14 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -124,6 +140,11 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBToUV444Row(src_argb, dst_u, dst_v, width); @@ -230,7 +251,24 @@ int ARGBToI422(const uint8_t* src_argb, } } #endif - +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) 
&& defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -241,6 +279,11 @@ int ARGBToI422(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); @@ -340,6 +383,14 @@ int ARGBToNV12(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -350,6 +401,11 @@ int ARGBToNV12(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -361,11 +417,19 @@ int ARGBToNV12(const uint8_t* src_argb, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow_ = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow_ = MergeUVRow_AVX2; } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow_ = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow_ = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; @@ -389,6 +453,11 @@ int ARGBToNV12(const uint8_t* src_argb, MergeUVRow_ = MergeUVRow_LSX; } } +#endif +#if 
defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow_ = MergeUVRow_RVV; + } #endif { // Allocate a rows of uv. @@ -502,6 +571,24 @@ int ARGBToNV21(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -512,6 +599,11 @@ int ARGBToNV21(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -523,11 +615,19 @@ int ARGBToNV21(const uint8_t* src_argb, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow_ = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow_ = MergeUVRow_AVX2; } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow_ = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 64)) { + MergeUVRow_ = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; @@ -551,6 +651,11 @@ int ARGBToNV21(const uint8_t* src_argb, MergeUVRow_ = MergeUVRow_LSX; } } +#endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow_ = MergeUVRow_RVV; + } #endif { // Allocate a rows of uv. 
@@ -663,6 +768,27 @@ int ABGRToNV12(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ABGRToYRow = ABGRToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_LSX; + } + } +#endif +#if defined(HAS_ABGRTOYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ABGRToYRow = ABGRToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ABGRToYRow = ABGRToYRow_LASX; + } + } +#endif +#if defined(HAS_ABGRTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYRow = ABGRToYRow_RVV; + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -674,11 +800,19 @@ int ABGRToNV12(const uint8_t* src_abgr, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow_ = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow_ = MergeUVRow_AVX2; } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow_ = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 64)) { + MergeUVRow_ = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; @@ -702,6 +836,11 @@ int ABGRToNV12(const uint8_t* src_abgr, MergeUVRow_ = MergeUVRow_LSX; } } +#endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow_ = MergeUVRow_RVV; + } #endif { // Allocate a rows of uv. 
@@ -815,6 +954,27 @@ int ABGRToNV21(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ABGRToYRow = ABGRToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_LSX; + } + } +#endif +#if defined(HAS_ABGRTOYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ABGRToYRow = ABGRToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ABGRToYRow = ABGRToYRow_LASX; + } + } +#endif +#if defined(HAS_ABGRTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYRow = ABGRToYRow_RVV; + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -826,11 +986,19 @@ int ABGRToNV21(const uint8_t* src_abgr, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow_ = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow_ = MergeUVRow_AVX2; } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow_ = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 64)) { + MergeUVRow_ = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; @@ -854,6 +1022,11 @@ int ABGRToNV21(const uint8_t* src_abgr, MergeUVRow_ = MergeUVRow_LSX; } } +#endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow_ = MergeUVRow_RVV; + } #endif { // Allocate a rows of uv. 
@@ -972,6 +1145,24 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -982,6 +1173,11 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif #if defined(HAS_I422TOYUY2ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; @@ -1014,6 +1210,14 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif +#if defined(HAS_I422TOYUY2ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToYUY2Row = I422ToYUY2Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_LSX; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToYUY2Row = I422ToYUY2Row_Any_LASX; @@ -1135,6 +1339,24 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) && defined(HAS_ARGBTOUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + ARGBToUVRow = ARGBToUVRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + ARGBToUVRow = ARGBToUVRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) && defined(HAS_ARGBTOUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ 
-1145,6 +1367,11 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif #if defined(HAS_I422TOUYVYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; @@ -1177,6 +1404,14 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif +#if defined(HAS_I422TOUYVYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToUYVYRow = I422ToUYVYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_LSX; + } + } +#endif #if defined(HAS_I422TOUYVYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToUYVYRow = I422ToUYVYRow_Any_LASX; @@ -1262,6 +1497,14 @@ int ARGBToI400(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif #if defined(HAS_ARGBTOYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYRow = ARGBToYRow_Any_LASX; @@ -1270,6 +1513,11 @@ int ARGBToI400(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBToYRow(src_argb, dst_y, width); @@ -1360,6 +1608,14 @@ int ARGBToRGB24(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB24ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_LSX; + } + } +#endif #if defined(HAS_ARGBTORGB24ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToRGB24Row = ARGBToRGB24Row_Any_LASX; @@ -1368,6 +1624,11 @@ int ARGBToRGB24(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToRGB24Row = ARGBToRGB24Row_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB24Row(src_argb, dst_rgb24, width); @@ -1434,6 +1695,14 @@ int ARGBToRAW(const uint8_t* src_argb, } } #endif +#if 
defined(HAS_ARGBTORAWROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToRAWRow = ARGBToRAWRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToRAWRow = ARGBToRAWRow_LSX; + } + } +#endif #if defined(HAS_ARGBTORAWROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToRAWRow = ARGBToRAWRow_Any_LASX; @@ -1442,6 +1711,11 @@ int ARGBToRAW(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORAWROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToRAWRow = ARGBToRAWRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBToRAWRow(src_argb, dst_raw, width); @@ -1467,7 +1741,7 @@ int ARGBToRGB565Dither(const uint8_t* src_argb, int height) { int y; void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, int width) = + uint32_t dither4, int width) = ARGBToRGB565DitherRow_C; if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { return -1; @@ -1512,6 +1786,14 @@ int ARGBToRGB565Dither(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB565DITHERROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_LSX; + } + } +#endif #if defined(HAS_ARGBTORGB565DITHERROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_LASX; @@ -1589,6 +1871,15 @@ int ARGBToRGB565(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB565ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_LSX; + } + } +#endif + #if defined(HAS_ARGBTORGB565ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToRGB565Row = ARGBToRGB565Row_Any_LASX; @@ -1663,6 +1954,14 @@ int ARGBToARGB1555(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOARGB1555ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_LSX; + 
} + } +#endif #if defined(HAS_ARGBTOARGB1555ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToARGB1555Row = ARGBToARGB1555Row_Any_LASX; @@ -1737,6 +2036,14 @@ int ARGBToARGB4444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOARGB4444ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_LSX; + } + } +#endif #if defined(HAS_ARGBTOARGB4444ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToARGB4444Row = ARGBToARGB4444Row_Any_LASX; @@ -1858,19 +2165,19 @@ int ARGBToJ420(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_yj, int dst_stride_yj, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, int width, int height) { int y; void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = + uint8_t* dst_uj, uint8_t* dst_vj, int width) = ARGBToUVJRow_C; void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = ARGBToYJRow_C; - if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { + if (!src_argb || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
@@ -1879,6 +2186,22 @@ int ARGBToJ420(const uint8_t* src_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_NEON; + } + } +#endif #if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYJRow = ARGBToYJRow_Any_SSSE3; @@ -1903,19 +2226,11 @@ int ARGBToJ420(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYJRow = ARGBToYJRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVJRow = ARGBToUVJRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_NEON; +#if defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_AVX2; } } #endif @@ -1951,18 +2266,23 @@ int ARGBToJ420(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYJRow = ARGBToYJRow_RVV; + } +#endif for (y = 0; y < height - 1; y += 2) { - ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToUVJRow(src_argb, src_stride_argb, dst_uj, dst_vj, width); ARGBToYJRow(src_argb, dst_yj, width); ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width); src_argb += src_stride_argb * 2; dst_yj += dst_stride_yj * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; + dst_uj += dst_stride_uj; + dst_vj += dst_stride_vj; } if (height & 1) { - ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); + ARGBToUVJRow(src_argb, 0, 
dst_uj, dst_vj, width); ARGBToYJRow(src_argb, dst_yj, width); } return 0; @@ -1974,19 +2294,19 @@ int ARGBToJ422(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_yj, int dst_stride_yj, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, int width, int height) { int y; void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = + uint8_t* dst_uj, uint8_t* dst_vj, int width) = ARGBToUVJRow_C; void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = ARGBToYJRow_C; - if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { + if (!src_argb || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1997,10 +2317,10 @@ int ARGBToJ422(const uint8_t* src_argb, } // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_yj == width && - dst_stride_u * 2 == width && dst_stride_v * 2 == width) { + dst_stride_uj * 2 == width && dst_stride_vj * 2 == width) { width *= height; height = 1; - src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0; + src_stride_argb = dst_stride_yj = dst_stride_uj = dst_stride_vj = 0; } #if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { @@ -2026,6 +2346,14 @@ int ARGBToJ422(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYJRow = ARGBToYJRow_Any_NEON; @@ -2074,270 +2402,649 @@ int ARGBToJ422(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYJRow = ARGBToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { - ARGBToUVJRow(src_argb, 0, dst_u, 
dst_v, width); + ARGBToUVJRow(src_argb, 0, dst_uj, dst_vj, width); ARGBToYJRow(src_argb, dst_yj, width); src_argb += src_stride_argb; dst_yj += dst_stride_yj; - dst_u += dst_stride_u; - dst_v += dst_stride_v; + dst_uj += dst_stride_uj; + dst_vj += dst_stride_vj; } return 0; } -// Convert ARGB to AR64. +// Convert ARGB to J400. LIBYUV_API -int ARGBToAR64(const uint8_t* src_argb, +int ARGBToJ400(const uint8_t* src_argb, int src_stride_argb, - uint16_t* dst_ar64, - int dst_stride_ar64, + uint8_t* dst_yj, + int dst_stride_yj, int width, int height) { int y; - void (*ARGBToAR64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, - int width) = ARGBToAR64Row_C; - if (!src_argb || !dst_ar64 || width <= 0 || height == 0) { + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = + ARGBToYJRow_C; + if (!src_argb || !dst_yj || width <= 0 || height == 0) { return -1; } - // Negative height means invert the image. if (height < 0) { height = -height; src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } // Coalesce rows. 
- if (src_stride_argb == width * 4 && dst_stride_ar64 == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_yj == width) { width *= height; height = 1; - src_stride_argb = dst_stride_ar64 = 0; + src_stride_argb = dst_stride_yj = 0; } -#if defined(HAS_ARGBTOAR64ROW_SSSE3) +#if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToAR64Row = ARGBToAR64Row_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBToAR64Row = ARGBToAR64Row_SSSE3; + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOAR64ROW_AVX2) +#if defined(HAS_ARGBTOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToAR64Row = ARGBToAR64Row_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToAR64Row = ARGBToAR64Row_AVX2; + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; } } #endif -#if defined(HAS_ARGBTOAR64ROW_NEON) +#if defined(HAS_ARGBTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ARGBToAR64Row = ARGBToAR64Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToAR64Row = ARGBToAR64Row_NEON; + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; } } #endif +#if defined(HAS_ARGBTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYJRow = ARGBToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { - ARGBToAR64Row(src_argb, dst_ar64, width); + ARGBToYJRow(src_argb, dst_yj, width); src_argb += src_stride_argb; - dst_ar64 += dst_stride_ar64; + dst_yj += dst_stride_yj; } return 0; } -// Convert ARGB to AB64. +// Convert RGBA to J400. 
LIBYUV_API -int ARGBToAB64(const uint8_t* src_argb, - int src_stride_argb, - uint16_t* dst_ab64, - int dst_stride_ab64, +int RGBAToJ400(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_yj, + int dst_stride_yj, int width, int height) { int y; - void (*ARGBToAB64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, - int width) = ARGBToAB64Row_C; - if (!src_argb || !dst_ab64 || width <= 0 || height == 0) { + void (*RGBAToYJRow)(const uint8_t* src_rgba, uint8_t* dst_yj, int width) = + RGBAToYJRow_C; + if (!src_rgba || !dst_yj || width <= 0 || height == 0) { return -1; } - // Negative height means invert the image. if (height < 0) { height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; + src_rgba = src_rgba + (height - 1) * src_stride_rgba; + src_stride_rgba = -src_stride_rgba; } // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_ab64 == width * 4) { + if (src_stride_rgba == width * 4 && dst_stride_yj == width) { width *= height; height = 1; - src_stride_argb = dst_stride_ab64 = 0; + src_stride_rgba = dst_stride_yj = 0; } -#if defined(HAS_ARGBTOAB64ROW_SSSE3) +#if defined(HAS_RGBATOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToAB64Row = ARGBToAB64Row_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBToAB64Row = ARGBToAB64Row_SSSE3; + RGBAToYJRow = RGBAToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGBAToYJRow = RGBAToYJRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOAB64ROW_AVX2) +#if defined(HAS_RGBATOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToAB64Row = ARGBToAB64Row_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToAB64Row = ARGBToAB64Row_AVX2; + RGBAToYJRow = RGBAToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RGBAToYJRow = RGBAToYJRow_AVX2; } } #endif -#if defined(HAS_ARGBTOAB64ROW_NEON) +#if defined(HAS_RGBATOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ARGBToAB64Row = ARGBToAB64Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToAB64Row = ARGBToAB64Row_NEON; + 
RGBAToYJRow = RGBAToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGBAToYJRow = RGBAToYJRow_NEON; + } + } +#endif +#if defined(HAS_RGBATOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGBAToYJRow = RGBAToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGBAToYJRow = RGBAToYJRow_MSA; + } + } +#endif +#if defined(HAS_RGBATOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RGBAToYJRow = RGBAToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RGBAToYJRow = RGBAToYJRow_LSX; } } #endif +#if defined(HAS_RGBATOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGBAToYJRow = RGBAToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGBAToYJRow = RGBAToYJRow_LASX; + } + } +#endif +#if defined(HAS_RGBATOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGBAToYJRow = RGBAToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { - ARGBToAB64Row(src_argb, dst_ab64, width); - src_argb += src_stride_argb; - dst_ab64 += dst_stride_ab64; + RGBAToYJRow(src_rgba, dst_yj, width); + src_rgba += src_stride_rgba; + dst_yj += dst_stride_yj; } return 0; } -// Convert ARGB to J400. +// Convert ABGR to J420. (JPeg full range I420). LIBYUV_API -int ARGBToJ400(const uint8_t* src_argb, - int src_stride_argb, +int ABGRToJ420(const uint8_t* src_abgr, + int src_stride_abgr, uint8_t* dst_yj, int dst_stride_yj, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, int width, int height) { int y; - void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = - ARGBToYJRow_C; - if (!src_argb || !dst_yj || width <= 0 || height == 0) { + void (*ABGRToUVJRow)(const uint8_t* src_abgr0, int src_stride_abgr, + uint8_t* dst_uj, uint8_t* dst_vj, int width) = + ABGRToUVJRow_C; + void (*ABGRToYJRow)(const uint8_t* src_abgr, uint8_t* dst_yj, int width) = + ABGRToYJRow_C; + if (!src_abgr || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) { return -1; } + // Negative height means invert the image. 
if (height < 0) { height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } +#if defined(HAS_ABGRTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToYJRow = ABGRToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToUVJRow = ABGRToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVJRow = ABGRToUVJRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToYJRow = ABGRToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToYJRow = ABGRToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToUVJRow = ABGRToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToUVJRow = ABGRToUVJRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToYJRow = ABGRToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToUVJRow = ABGRToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToUVJRow = ABGRToUVJRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToYJRow = ABGRToYJRow_Any_MSA; + ABGRToUVJRow = ABGRToUVJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_MSA; + ABGRToUVJRow = ABGRToUVJRow_MSA; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ABGRToYJRow = ABGRToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_LSX; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ABGRToYJRow = ABGRToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + 
ABGRToYJRow = ABGRToYJRow_LASX; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYJRow = ABGRToYJRow_RVV; + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ABGRToUVJRow(src_abgr, src_stride_abgr, dst_uj, dst_vj, width); + ABGRToYJRow(src_abgr, dst_yj, width); + ABGRToYJRow(src_abgr + src_stride_abgr, dst_yj + dst_stride_yj, width); + src_abgr += src_stride_abgr * 2; + dst_yj += dst_stride_yj * 2; + dst_uj += dst_stride_uj; + dst_vj += dst_stride_vj; + } + if (height & 1) { + ABGRToUVJRow(src_abgr, 0, dst_uj, dst_vj, width); + ABGRToYJRow(src_abgr, dst_yj, width); + } + return 0; +} + +// Convert ABGR to J422. (JPeg full range I422). +LIBYUV_API +int ABGRToJ422(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, + int width, + int height) { + int y; + void (*ABGRToUVJRow)(const uint8_t* src_abgr0, int src_stride_abgr, + uint8_t* dst_uj, uint8_t* dst_vj, int width) = + ABGRToUVJRow_C; + void (*ABGRToYJRow)(const uint8_t* src_abgr, uint8_t* dst_yj, int width) = + ABGRToYJRow_C; + if (!src_abgr || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; } // Coalesce rows. 
- if (src_stride_argb == width * 4 && dst_stride_yj == width) { + if (src_stride_abgr == width * 4 && dst_stride_yj == width && + dst_stride_uj * 2 == width && dst_stride_vj * 2 == width) { width *= height; height = 1; - src_stride_argb = dst_stride_yj = 0; + src_stride_abgr = dst_stride_yj = dst_stride_uj = dst_stride_vj = 0; } -#if defined(HAS_ARGBTOYJROW_SSSE3) +#if defined(HAS_ABGRTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + ABGRToYJRow = ABGRToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; + ABGRToYJRow = ABGRToYJRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOYJROW_AVX2) +#if defined(HAS_ABGRTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToUVJRow = ABGRToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVJRow = ABGRToUVJRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYJRow = ARGBToYJRow_Any_AVX2; + ABGRToYJRow = ABGRToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - ARGBToYJRow = ARGBToYJRow_AVX2; + ABGRToYJRow = ABGRToYJRow_AVX2; } } #endif -#if defined(HAS_ARGBTOYJROW_NEON) +#if defined(HAS_ABGRTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToUVJRow = ABGRToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToUVJRow = ABGRToUVJRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYJRow = ARGBToYJRow_Any_NEON; + ABGRToYJRow = ABGRToYJRow_Any_NEON; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_NEON; + ABGRToYJRow = ABGRToYJRow_NEON; } } #endif -#if defined(HAS_ARGBTOYJROW_MSA) +#if defined(HAS_ABGRTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToUVJRow = ABGRToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToUVJRow = ABGRToUVJRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { - ARGBToYJRow = ARGBToYJRow_Any_MSA; + ABGRToYJRow = ABGRToYJRow_Any_MSA; + ABGRToUVJRow 
= ABGRToUVJRow_Any_MSA; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_MSA; + ABGRToYJRow = ABGRToYJRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ABGRToUVJRow = ABGRToUVJRow_MSA; } } #endif +#if defined(HAS_ABGRTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ABGRToYJRow = ABGRToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_LSX; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ABGRToYJRow = ABGRToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ABGRToYJRow = ABGRToYJRow_LASX; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYJRow = ABGRToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { - ARGBToYJRow(src_argb, dst_yj, width); - src_argb += src_stride_argb; + ABGRToUVJRow(src_abgr, 0, dst_uj, dst_vj, width); + ABGRToYJRow(src_abgr, dst_yj, width); + src_abgr += src_stride_abgr; dst_yj += dst_stride_yj; + dst_uj += dst_stride_uj; + dst_vj += dst_stride_vj; } return 0; } -// Convert RGBA to J400. +// Convert ABGR to J400. LIBYUV_API -int RGBAToJ400(const uint8_t* src_rgba, - int src_stride_rgba, +int ABGRToJ400(const uint8_t* src_abgr, + int src_stride_abgr, uint8_t* dst_yj, int dst_stride_yj, int width, int height) { int y; - void (*RGBAToYJRow)(const uint8_t* src_rgba, uint8_t* dst_yj, int width) = - RGBAToYJRow_C; - if (!src_rgba || !dst_yj || width <= 0 || height == 0) { + void (*ABGRToYJRow)(const uint8_t* src_abgr, uint8_t* dst_yj, int width) = + ABGRToYJRow_C; + if (!src_abgr || !dst_yj || width <= 0 || height == 0) { return -1; } if (height < 0) { height = -height; - src_rgba = src_rgba + (height - 1) * src_stride_rgba; - src_stride_rgba = -src_stride_rgba; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; } // Coalesce rows. 
- if (src_stride_rgba == width * 4 && dst_stride_yj == width) { + if (src_stride_abgr == width * 4 && dst_stride_yj == width) { width *= height; height = 1; - src_stride_rgba = dst_stride_yj = 0; + src_stride_abgr = dst_stride_yj = 0; } -#if defined(HAS_RGBATOYJROW_SSSE3) +#if defined(HAS_ABGRTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - RGBAToYJRow = RGBAToYJRow_Any_SSSE3; + ABGRToYJRow = ABGRToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RGBAToYJRow = RGBAToYJRow_SSSE3; + ABGRToYJRow = ABGRToYJRow_SSSE3; } } #endif -#if defined(HAS_RGBATOYJROW_AVX2) +#if defined(HAS_ABGRTOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - RGBAToYJRow = RGBAToYJRow_Any_AVX2; + ABGRToYJRow = ABGRToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - RGBAToYJRow = RGBAToYJRow_AVX2; + ABGRToYJRow = ABGRToYJRow_AVX2; } } #endif -#if defined(HAS_RGBATOYJROW_NEON) +#if defined(HAS_ABGRTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - RGBAToYJRow = RGBAToYJRow_Any_NEON; + ABGRToYJRow = ABGRToYJRow_Any_NEON; if (IS_ALIGNED(width, 16)) { - RGBAToYJRow = RGBAToYJRow_NEON; + ABGRToYJRow = ABGRToYJRow_NEON; } } #endif -#if defined(HAS_RGBATOYJROW_MSA) +#if defined(HAS_ABGRTOYJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { - RGBAToYJRow = RGBAToYJRow_Any_MSA; + ABGRToYJRow = ABGRToYJRow_Any_MSA; if (IS_ALIGNED(width, 16)) { - RGBAToYJRow = RGBAToYJRow_MSA; + ABGRToYJRow = ABGRToYJRow_MSA; } } #endif +#if defined(HAS_ABGRTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ABGRToYJRow = ABGRToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_LSX; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ABGRToYJRow = ABGRToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ABGRToYJRow = ABGRToYJRow_LASX; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ABGRToYJRow = ABGRToYJRow_RVV; + } +#endif for (y = 0; y < height; ++y) { - RGBAToYJRow(src_rgba, dst_yj, width); - src_rgba += src_stride_rgba; + ABGRToYJRow(src_abgr, dst_yj, 
width); + src_abgr += src_stride_abgr; dst_yj += dst_stride_yj; } return 0; } +// Convert ARGB to AR64. +LIBYUV_API +int ARGBToAR64(const uint8_t* src_argb, + int src_stride_argb, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height) { + int y; + void (*ARGBToAR64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, + int width) = ARGBToAR64Row_C; + if (!src_argb || !dst_ar64 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_ar64 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ar64 = 0; + } +#if defined(HAS_ARGBTOAR64ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAR64Row = ARGBToAR64Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAR64Row = ARGBToAR64Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAR64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAR64Row = ARGBToAR64Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAR64Row = ARGBToAR64Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOAR64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToAR64Row = ARGBToAR64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToAR64Row = ARGBToAR64Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOAR64ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToAR64Row = ARGBToAR64Row_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToAR64Row(src_argb, dst_ar64, width); + src_argb += src_stride_argb; + dst_ar64 += dst_stride_ar64; + } + return 0; +} + +// Convert ARGB to AB64. 
+LIBYUV_API +int ARGBToAB64(const uint8_t* src_argb, + int src_stride_argb, + uint16_t* dst_ab64, + int dst_stride_ab64, + int width, + int height) { + int y; + void (*ARGBToAB64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, + int width) = ARGBToAB64Row_C; + if (!src_argb || !dst_ab64 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_ab64 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ab64 = 0; + } +#if defined(HAS_ARGBTOAB64ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAB64Row = ARGBToAB64Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAB64Row = ARGBToAB64Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAB64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAB64Row = ARGBToAB64Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAB64Row = ARGBToAB64Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOAB64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToAB64Row = ARGBToAB64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToAB64Row = ARGBToAB64Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOAB64ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToAB64Row = ARGBToAB64Row_RVV; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToAB64Row(src_argb, dst_ab64, width); + src_argb += src_stride_argb; + dst_ab64 += dst_stride_ab64; + } + return 0; +} + // Enabled if 1 pass is available -#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) +#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) || \ + defined(HAS_RAWTOYJROW_RVV) #define HAS_RAWTOYJROW #endif @@ -2355,7 +3062,7 @@ int RAWToJNV21(const uint8_t* src_raw, int halfwidth = (width + 1) >> 1; #if defined(HAS_RAWTOYJROW) void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw, - uint8_t* dst_u, 
uint8_t* dst_v, int width) = + uint8_t* dst_uj, uint8_t* dst_vj, int width) = RAWToUVJRow_C; void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = RAWToYJRow_C; @@ -2363,12 +3070,12 @@ int RAWToJNV21(const uint8_t* src_raw, void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RAWToARGBRow_C; void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = + uint8_t* dst_uj, uint8_t* dst_vj, int width) = ARGBToUVJRow_C; void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYJRow_C; #endif - void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + void (*MergeUVRow_)(const uint8_t* src_uj, const uint8_t* src_vj, uint8_t* dst_vu, int width) = MergeUVRow_C; if (!src_raw || !dst_y || !dst_vu || width <= 0 || height == 0) { return -1; @@ -2403,6 +3110,27 @@ int RAWToJNV21(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOYJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RAWToYJRow = RAWToYJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_LSX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RAWToYJRow = RAWToYJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RAWToYJRow = RAWToYJRow_LASX; + } + } +#endif +#if defined(HAS_RAWTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToYJRow = RAWToYJRow_RVV; + } +#endif // Other platforms do intermediate conversion from RAW to ARGB. 
#else // HAS_RAWTOYJROW @@ -2459,11 +3187,19 @@ int RAWToJNV21(const uint8_t* src_raw, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow_ = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(halfwidth, 32)) { + if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow_ = MergeUVRow_AVX2; } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow_ = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 64)) { + MergeUVRow_ = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; @@ -2487,30 +3223,35 @@ int RAWToJNV21(const uint8_t* src_raw, MergeUVRow_ = MergeUVRow_LSX; } } +#endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow_ = MergeUVRow_RVV; + } #endif { // Allocate a row of uv. - align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); - uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); + align_buffer_64(row_uj, ((halfwidth + 31) & ~31) * 2); + uint8_t* row_vj = row_uj + ((halfwidth + 31) & ~31); #if !defined(HAS_RAWTOYJROW) // Allocate 2 rows of ARGB. 
- const int kRowSize = (width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); #endif for (y = 0; y < height - 1; y += 2) { #if defined(HAS_RAWTOYJROW) - RAWToUVJRow(src_raw, src_stride_raw, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + RAWToUVJRow(src_raw, src_stride_raw, row_uj, row_vj, width); + MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth); RAWToYJRow(src_raw, dst_y, width); RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); #else RAWToARGBRow(src_raw, row, width); - RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); - ARGBToUVJRow(row, kRowSize, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + RAWToARGBRow(src_raw + src_stride_raw, row + row_size, width); + ARGBToUVJRow(row, row_size, row_uj, row_vj, width); + MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth); ARGBToYJRow(row, dst_y, width); - ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width); + ARGBToYJRow(row + row_size, dst_y + dst_stride_y, width); #endif src_raw += src_stride_raw * 2; dst_y += dst_stride_y * 2; @@ -2518,20 +3259,20 @@ int RAWToJNV21(const uint8_t* src_raw, } if (height & 1) { #if defined(HAS_RAWTOYJROW) - RAWToUVJRow(src_raw, 0, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + RAWToUVJRow(src_raw, 0, row_uj, row_vj, width); + MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth); RAWToYJRow(src_raw, dst_y, width); #else RAWToARGBRow(src_raw, row, width); - ARGBToUVJRow(row, 0, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + ARGBToUVJRow(row, 0, row_uj, row_vj, width); + MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth); ARGBToYJRow(row, dst_y, width); #endif } #if !defined(HAS_RAWTOYJROW) free_aligned_buffer_64(row); #endif - free_aligned_buffer_64(row_u); + free_aligned_buffer_64(row_uj); } return 0; } diff --git a/files/source/cpu_id.cc b/files/source/cpu_id.cc index 
56fe60e4..0c4a1581 100644 --- a/files/source/cpu_id.cc +++ b/files/source/cpu_id.cc @@ -40,7 +40,6 @@ extern "C" { // cpu_info_ variable for SIMD instruction sets detected. LIBYUV_API int cpu_info_ = 0; -// TODO(fbarchard): Consider using int for cpuid so casting is not needed. // Low level cpuid for X86. #if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ defined(__x86_64__)) && \ @@ -108,14 +107,14 @@ void CpuId(int eax, int ecx, int* cpu_info) { // } // For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code. // https://code.google.com/p/libyuv/issues/detail?id=529 -#if defined(_M_IX86) && (_MSC_VER < 1900) +#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900) #pragma optimize("g", off) #endif #if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ defined(__x86_64__)) && \ !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__) // X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. -int GetXCR0() { +static int GetXCR0() { int xcr0 = 0; #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) xcr0 = (int)_xgetbv(0); // VS2010 SP1 required. NOLINT @@ -129,7 +128,7 @@ int GetXCR0() { #define GetXCR0() 0 #endif // defined(_M_IX86) || defined(_M_X64) .. // Return optimization to previous setting. -#if defined(_M_IX86) && (_MSC_VER < 1900) +#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900) #pragma optimize("g", on) #endif @@ -137,13 +136,14 @@ int GetXCR0() { // For Arm, but public to allow testing on any CPU LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) { char cpuinfo_line[512]; - FILE* f = fopen(cpuinfo_name, "r"); + FILE* f = fopen(cpuinfo_name, "re"); if (!f) { // Assume Neon if /proc/cpuinfo is unavailable. // This will occur for Chrome sandbox for Pepper or Render process. 
return kCpuHasNEON; } - while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { + memset(cpuinfo_line, 0, sizeof(cpuinfo_line)); + while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) { if (memcmp(cpuinfo_line, "Features", 8) == 0) { char* p = strstr(cpuinfo_line, " neon"); if (p && (p[5] == ' ' || p[5] == '\n')) { @@ -162,17 +162,90 @@ LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) { return 0; } -// TODO(fbarchard): Consider read_msa_ir(). +LIBYUV_API SAFEBUFFERS int RiscvCpuCaps(const char* cpuinfo_name) { + char cpuinfo_line[512]; + int flag = 0; + FILE* f = fopen(cpuinfo_name, "re"); + if (!f) { +#if defined(__riscv_vector) + // Assume RVV if /proc/cpuinfo is unavailable. + // This will occur for Chrome sandbox for Pepper or Render process. + return kCpuHasRVV; +#else + return 0; +#endif + } + memset(cpuinfo_line, 0, sizeof(cpuinfo_line)); + while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) { + if (memcmp(cpuinfo_line, "isa", 3) == 0) { + // ISA string must begin with rv64{i,e,g} for a 64-bit processor. + char* isa = strstr(cpuinfo_line, "rv64"); + if (isa) { + size_t isa_len = strlen(isa); + char* extensions; + size_t extensions_len = 0; + size_t std_isa_len; + // Remove the new-line character at the end of string + if (isa[isa_len - 1] == '\n') { + isa[--isa_len] = '\0'; + } + // 5 ISA characters + if (isa_len < 5) { + fclose(f); + return 0; + } + // Skip {i,e,g} canonical checking. + // Skip rvxxx + isa += 5; + // Find the very first occurrence of 's', 'x' or 'z'. + // To detect multi-letter standard, non-standard, and + // supervisor-level extensions. + extensions = strpbrk(isa, "zxs"); + if (extensions) { + // Multi-letter extensions are seperated by a single underscore + // as described in RISC-V User-Level ISA V2.2. + char* ext = strtok(extensions, "_"); + extensions_len = strlen(extensions); + while (ext) { + // Search for the ZVFH (Vector FP16) extension. 
+ if (!strcmp(ext, "zvfh")) { + flag |= kCpuHasRVVZVFH; + } + ext = strtok(NULL, "_"); + } + } + std_isa_len = isa_len - extensions_len - 5; + // Detect the v in the standard single-letter extensions. + if (memchr(isa, 'v', std_isa_len)) { + // The RVV implied the F extension. + flag |= kCpuHasRVV; + } + } + } +#if defined(__riscv_vector) + // Assume RVV if /proc/cpuinfo is from x86 host running QEMU. + else if ((memcmp(cpuinfo_line, "vendor_id\t: GenuineIntel", 24) == 0) || + (memcmp(cpuinfo_line, "vendor_id\t: AuthenticAMD", 24) == 0)) { + fclose(f); + return kCpuHasRVV; + } +#endif + } + fclose(f); + return flag; +} + LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) { char cpuinfo_line[512]; - int flag = 0x0; - FILE* f = fopen(cpuinfo_name, "r"); + int flag = 0; + FILE* f = fopen(cpuinfo_name, "re"); if (!f) { // Assume nothing if /proc/cpuinfo is unavailable. // This will occur for Chrome sandbox for Pepper or Render process. return 0; } - while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { + memset(cpuinfo_line, 0, sizeof(cpuinfo_line)); + while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) { if (memcmp(cpuinfo_line, "cpu model", 9) == 0) { // Workaround early kernel without MSA in ASEs line. if (strstr(cpuinfo_line, "Loongson-2K")) { @@ -191,14 +264,13 @@ LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) { return flag; } -// TODO(fbarchard): Consider read_loongarch_ir(). 
#define LOONGARCH_CFG2 0x2 #define LOONGARCH_CFG2_LSX (1 << 6) #define LOONGARCH_CFG2_LASX (1 << 7) #if defined(__loongarch__) LIBYUV_API SAFEBUFFERS int LoongarchCpuCaps(void) { - int flag = 0x0; + int flag = 0; uint32_t cfg2 = 0; __asm__ volatile("cpucfg %0, %1 \n\t" : "+&r"(cfg2) : "r"(LOONGARCH_CFG2)); @@ -277,6 +349,10 @@ static SAFEBUFFERS int GetCpuFlags(void) { #endif cpu_info |= kCpuHasARM; #endif // __arm__ +#if defined(__riscv) && defined(__linux__) + cpu_info = RiscvCpuCaps("/proc/cpuinfo"); + cpu_info |= kCpuHasRISCV; +#endif // __riscv cpu_info |= kCpuInitialized; return cpu_info; } diff --git a/files/source/mjpeg_decoder.cc b/files/source/mjpeg_decoder.cc index 4ccf00a3..0141da8a 100644 --- a/files/source/mjpeg_decoder.cc +++ b/files/source/mjpeg_decoder.cc @@ -109,7 +109,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8_t* src, size_t src_len) { } buf_.data = src; - buf_.len = static_cast(src_len); + buf_.len = (int)src_len; buf_vec_.pos = 0; decompress_struct_->client_data = &buf_vec_; #ifdef HAVE_SETJMP @@ -428,7 +428,7 @@ boolean fill_input_buffer(j_decompress_ptr cinfo) { void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT jpeg_source_mgr* src = cinfo->src; - size_t bytes = static_cast(num_bytes); + size_t bytes = (size_t)num_bytes; if (bytes > src->bytes_in_buffer) { src->next_input_byte = nullptr; src->bytes_in_buffer = 0; diff --git a/files/source/planar_functions.cc b/files/source/planar_functions.cc index 169d4a8f..d115a2a1 100644 --- a/files/source/planar_functions.cc +++ b/files/source/planar_functions.cc @@ -75,6 +75,11 @@ void CopyPlane(const uint8_t* src_y, CopyRow = IS_ALIGNED(width, 32) ? 
CopyRow_NEON : CopyRow_Any_NEON; } #endif +#if defined(HAS_COPYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + CopyRow = CopyRow_RVV; + } +#endif // Copy plane for (y = 0; y < height; ++y) { @@ -162,7 +167,7 @@ void Convert8To16Plane(const uint8_t* src_y, int src_stride_y, uint16_t* dst_y, int dst_stride_y, - int scale, // 16384 for 10 bits + int scale, // 1024 for 10 bits int width, int height) { int y; @@ -333,6 +338,45 @@ int I210Copy(const uint16_t* src_y, return 0; } +// Copy I410. +LIBYUV_API +int I410Copy(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; +} + // Copy I400. LIBYUV_API int I400ToI400(const uint8_t* src_y, @@ -385,6 +429,7 @@ int I420ToI400(const uint8_t* src_y, } // Copy NV12. Supports inverting. +LIBYUV_API int NV12Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, @@ -418,6 +463,7 @@ int NV12Copy(const uint8_t* src_y, } // Copy NV21. Supports inverting. 
+LIBYUV_API int NV21Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, @@ -504,6 +550,11 @@ void SplitUVPlane(const uint8_t* src_uv, } } #endif +#if defined(HAS_SPLITUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + SplitUVRow = SplitUVRow_RVV; + } +#endif for (y = 0; y < height; ++y) { // Copy a row of UV. @@ -553,11 +604,19 @@ void MergeUVPlane(const uint8_t* src_u, #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow = MergeUVRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { + if (IS_ALIGNED(width, 16)) { MergeUVRow = MergeUVRow_AVX2; } } #endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(width, 32)) { + MergeUVRow = MergeUVRow_AVX512BW; + } + } +#endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow = MergeUVRow_Any_NEON; @@ -582,6 +641,11 @@ void MergeUVPlane(const uint8_t* src_u, } } #endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow = MergeUVRow_RVV; + } +#endif for (y = 0; y < height; ++y) { // Merge a row of U and V into a row of UV. @@ -687,7 +751,7 @@ void MergeUVPlane_16(const uint16_t* src_u, #if defined(HAS_MERGEUVROW_16_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow_16 = MergeUVRow_16_Any_AVX2; - if (IS_ALIGNED(width, 16)) { + if (IS_ALIGNED(width, 8)) { MergeUVRow_16 = MergeUVRow_16_AVX2; } } @@ -911,31 +975,31 @@ int NV21ToNV12(const uint8_t* src_y, return 0; } +// Test if tile_height is a power of 2 (16 or 32) +#define IS_POWEROFTWO(x) (!((x) & ((x)-1))) + // Detile a plane of data // tile width is 16 and assumed. // tile_height is 16 or 32 for MM21. // src_stride_y is bytes per row of source ignoring tiling. e.g. 640 // TODO: More detile row functions. 
- LIBYUV_API -void DetilePlane(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_y, - int dst_stride_y, - int width, - int height, - int tile_height) { +int DetilePlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height, + int tile_height) { const ptrdiff_t src_tile_stride = 16 * tile_height; int y; void (*DetileRow)(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width) = DetileRow_C; - assert(src_stride_y >= 0); - assert(tile_height > 0); - assert(src_stride_y > 0); - - if (width <= 0 || height == 0) { - return; + if (!src_y || !dst_y || width <= 0 || height == 0 || + !IS_POWEROFTWO(tile_height)) { + return -1; } + // Negative height means invert the image. if (height < 0) { height = -height; @@ -970,6 +1034,72 @@ void DetilePlane(const uint8_t* src_y, src_y = src_y - src_tile_stride + src_stride_y * tile_height; } } + return 0; +} + +// Convert a plane of 16 bit tiles of 16 x H to linear. +// tile width is 16 and assumed. +// tile_height is 16 or 32 for MT2T. +LIBYUV_API +int DetilePlane_16(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int width, + int height, + int tile_height) { + const ptrdiff_t src_tile_stride = 16 * tile_height; + int y; + void (*DetileRow_16)(const uint16_t* src, ptrdiff_t src_tile_stride, + uint16_t* dst, int width) = DetileRow_16_C; + if (!src_y || !dst_y || width <= 0 || height == 0 || + !IS_POWEROFTWO(tile_height)) { + return -1; + } + + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + +#if defined(HAS_DETILEROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + DetileRow_16 = DetileRow_16_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + DetileRow_16 = DetileRow_16_SSE2; + } + } +#endif +#if defined(HAS_DETILEROW_16_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + DetileRow_16 = DetileRow_16_Any_AVX; + if (IS_ALIGNED(width, 16)) { + DetileRow_16 = DetileRow_16_AVX; + } + } +#endif +#if defined(HAS_DETILEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + DetileRow_16 = DetileRow_16_Any_NEON; + if (IS_ALIGNED(width, 16)) { + DetileRow_16 = DetileRow_16_NEON; + } + } +#endif + + // Detile plane + for (y = 0; y < height; ++y) { + DetileRow_16(src_y, src_tile_stride, dst_y, width); + dst_y += dst_stride_y; + src_y += 16; + // Advance to next row of tiles. + if ((y & (tile_height - 1)) == (tile_height - 1)) { + src_y = src_y - src_tile_stride + src_stride_y * tile_height; + } + } + return 0; } LIBYUV_API @@ -1033,6 +1163,74 @@ void DetileSplitUVPlane(const uint8_t* src_uv, } } +LIBYUV_API +void DetileToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height, + int tile_height) { + const ptrdiff_t src_y_tile_stride = 16 * tile_height; + const ptrdiff_t src_uv_tile_stride = src_y_tile_stride / 2; + int y; + void (*DetileToYUY2)(const uint8_t* src_y, ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, int width) = DetileToYUY2_C; + assert(src_stride_y >= 0); + assert(src_stride_y > 0); + assert(src_stride_uv >= 0); + assert(src_stride_uv > 0); + assert(tile_height > 0); + + if (width <= 0 || height == 0 || tile_height <= 0) { + return; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } + +#if defined(HAS_DETILETOYUY2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + DetileToYUY2 = DetileToYUY2_Any_NEON; + if (IS_ALIGNED(width, 16)) { + DetileToYUY2 = DetileToYUY2_NEON; + } + } +#endif + +#if defined(HAS_DETILETOYUY2_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + DetileToYUY2 = DetileToYUY2_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + DetileToYUY2 = DetileToYUY2_SSE2; + } + } +#endif + + // Detile plane + for (y = 0; y < height; ++y) { + DetileToYUY2(src_y, src_y_tile_stride, src_uv, src_uv_tile_stride, dst_yuy2, + width); + dst_yuy2 += dst_stride_yuy2; + src_y += 16; + + if (y & 0x1) + src_uv += 16; + + // Advance to next row of tiles. + if ((y & (tile_height - 1)) == (tile_height - 1)) { + src_y = src_y - src_y_tile_stride + src_stride_y * tile_height; + src_uv = src_uv - src_uv_tile_stride + src_stride_uv * (tile_height / 2); + } + } +} + // Support function for NV12 etc RGB channels. // Width and height are plane sizes (typically half pixel width). LIBYUV_API @@ -1085,6 +1283,11 @@ void SplitRGBPlane(const uint8_t* src_rgb, } } #endif +#if defined(HAS_SPLITRGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + SplitRGBRow = SplitRGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { // Copy a row of RGB. @@ -1144,6 +1347,11 @@ void MergeRGBPlane(const uint8_t* src_r, } } #endif +#if defined(HAS_MERGERGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeRGBRow = MergeRGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { // Merge a row of U and V into a row of RGB. 
@@ -1156,18 +1364,18 @@ void MergeRGBPlane(const uint8_t* src_r, } LIBYUV_NOINLINE -void SplitARGBPlaneAlpha(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_r, - int dst_stride_r, - uint8_t* dst_g, - int dst_stride_g, - uint8_t* dst_b, - int dst_stride_b, - uint8_t* dst_a, - int dst_stride_a, - int width, - int height) { +static void SplitARGBPlaneAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height) { int y; void (*SplitARGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width) = @@ -1175,6 +1383,9 @@ void SplitARGBPlaneAlpha(const uint8_t* src_argb, assert(height > 0); + if (width <= 0 || height == 0) { + return; + } if (src_stride_argb == width * 4 && dst_stride_r == width && dst_stride_g == width && dst_stride_b == width && dst_stride_a == width) { width *= height; @@ -1215,6 +1426,11 @@ void SplitARGBPlaneAlpha(const uint8_t* src_argb, } } #endif +#if defined(HAS_SPLITARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + SplitARGBRow = SplitARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { SplitARGBRow(src_argb, dst_r, dst_g, dst_b, dst_a, width); @@ -1227,21 +1443,24 @@ void SplitARGBPlaneAlpha(const uint8_t* src_argb, } LIBYUV_NOINLINE -void SplitARGBPlaneOpaque(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_r, - int dst_stride_r, - uint8_t* dst_g, - int dst_stride_g, - uint8_t* dst_b, - int dst_stride_b, - int width, - int height) { +static void SplitARGBPlaneOpaque(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { int y; void (*SplitXRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width) = SplitXRGBRow_C; assert(height > 0); + 
if (width <= 0 || height == 0) { + return; + } if (src_stride_argb == width * 4 && dst_stride_r == width && dst_stride_g == width && dst_stride_b == width) { width *= height; @@ -1281,6 +1500,11 @@ void SplitARGBPlaneOpaque(const uint8_t* src_argb, } } #endif +#if defined(HAS_SPLITXRGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + SplitXRGBRow = SplitXRGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { SplitXRGBRow(src_argb, dst_r, dst_g, dst_b, width); @@ -1328,18 +1552,18 @@ void SplitARGBPlane(const uint8_t* src_argb, } LIBYUV_NOINLINE -void MergeARGBPlaneAlpha(const uint8_t* src_r, - int src_stride_r, - const uint8_t* src_g, - int src_stride_g, - const uint8_t* src_b, - int src_stride_b, - const uint8_t* src_a, - int src_stride_a, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { +static void MergeARGBPlaneAlpha(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; void (*MergeARGBRow)(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, const uint8_t* src_a, @@ -1347,6 +1571,9 @@ void MergeARGBPlaneAlpha(const uint8_t* src_r, assert(height > 0); + if (width <= 0 || height == 0) { + return; + } if (src_stride_r == width && src_stride_g == width && src_stride_b == width && src_stride_a == width && dst_stride_argb == width * 4) { width *= height; @@ -1378,6 +1605,11 @@ void MergeARGBPlaneAlpha(const uint8_t* src_r, } } #endif +#if defined(HAS_MERGEARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeARGBRow = MergeARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { MergeARGBRow(src_r, src_g, src_b, src_a, dst_argb, width); @@ -1390,16 +1622,16 @@ void MergeARGBPlaneAlpha(const uint8_t* src_r, } LIBYUV_NOINLINE -void MergeARGBPlaneOpaque(const uint8_t* src_r, - int src_stride_r, - const uint8_t* src_g, - int src_stride_g, - 
const uint8_t* src_b, - int src_stride_b, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { +static void MergeARGBPlaneOpaque(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; void (*MergeXRGBRow)(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, uint8_t* dst_argb, int width) = @@ -1407,6 +1639,9 @@ void MergeARGBPlaneOpaque(const uint8_t* src_r, assert(height > 0); + if (width <= 0 || height == 0) { + return; + } if (src_stride_r == width && src_stride_g == width && src_stride_b == width && dst_stride_argb == width * 4) { width *= height; @@ -1437,6 +1672,11 @@ void MergeARGBPlaneOpaque(const uint8_t* src_r, } } #endif +#if defined(HAS_MERGEXRGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeXRGBRow = MergeXRGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { MergeXRGBRow(src_r, src_g, src_b, dst_argb, width); @@ -1888,6 +2128,16 @@ int YUY2ToI422(const uint8_t* src_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_LSX) && defined(HAS_YUY2TOUV422ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + YUY2ToYRow = YUY2ToYRow_Any_LSX; + YUY2ToUV422Row = YUY2ToUV422Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_LSX; + YUY2ToUV422Row = YUY2ToUV422Row_LSX; + } + } +#endif #if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUV422ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { YUY2ToYRow = YUY2ToYRow_Any_LASX; @@ -1984,6 +2234,16 @@ int UYVYToI422(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_LSX) && defined(HAS_UYVYTOUV422ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + UYVYToYRow = UYVYToYRow_Any_LSX; + UYVYToUV422Row = UYVYToUV422Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_LSX; + UYVYToUV422Row = UYVYToUV422Row_LSX; + } + } +#endif #if defined(HAS_UYVYTOYROW_LASX) && defined(HAS_UYVYTOUV422ROW_LASX) if (TestCpuFlag(kCpuHasLASX)) 
{ UYVYToYRow = UYVYToYRow_Any_LASX; @@ -2131,6 +2391,14 @@ int UYVYToY(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + UYVYToYRow = UYVYToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_LSX; + } + } +#endif for (y = 0; y < height; ++y) { UYVYToYRow(src_uyvy, dst_y, width); @@ -2189,6 +2457,14 @@ void MirrorPlane(const uint8_t* src_y, } } #endif +#if defined(HAS_MIRRORROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MirrorRow = MirrorRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_LSX; + } + } +#endif #if defined(HAS_MIRRORROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { MirrorRow = MirrorRow_Any_LASX; @@ -2255,6 +2531,14 @@ void MirrorUVPlane(const uint8_t* src_uv, } } #endif +#if defined(HAS_MIRRORUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MirrorUVRow = MirrorUVRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + MirrorUVRow = MirrorUVRow_LSX; + } + } +#endif #if defined(HAS_MIRRORUVROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { MirrorUVRow = MirrorUVRow_Any_LASX; @@ -2427,6 +2711,14 @@ int ARGBMirror(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBMIRRORROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBMirrorRow = ARGBMirrorRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBMirrorRow = ARGBMirrorRow_LSX; + } + } +#endif #if defined(HAS_ARGBMIRRORROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBMirrorRow = ARGBMirrorRow_Any_LASX; @@ -2809,6 +3101,14 @@ int ARGBMultiply(const uint8_t* src_argb0, } } #endif +#if defined(HAS_ARGBMULTIPLYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_LSX; + if (IS_ALIGNED(width, 4)) { + ARGBMultiplyRow = ARGBMultiplyRow_LSX; + } + } +#endif #if defined(HAS_ARGBMULTIPLYROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBMultiplyRow = ARGBMultiplyRow_Any_LASX; @@ -2894,6 +3194,14 @@ int ARGBAdd(const uint8_t* src_argb0, } } #endif +#if defined(HAS_ARGBADDROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBAddRow = ARGBAddRow_Any_LSX; 
+ if (IS_ALIGNED(width, 4)) { + ARGBAddRow = ARGBAddRow_LSX; + } + } +#endif #if defined(HAS_ARGBADDROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBAddRow = ARGBAddRow_Any_LASX; @@ -2974,6 +3282,14 @@ int ARGBSubtract(const uint8_t* src_argb0, } } #endif +#if defined(HAS_ARGBSUBTRACTROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBSubtractRow = ARGBSubtractRow_Any_LSX; + if (IS_ALIGNED(width, 4)) { + ARGBSubtractRow = ARGBSubtractRow_LSX; + } + } +#endif #if defined(HAS_ARGBSUBTRACTROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBSubtractRow = ARGBSubtractRow_Any_LASX; @@ -3051,6 +3367,11 @@ int RAWToRGB24(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToRGB24Row = RAWToRGB24Row_RVV; + } +#endif for (y = 0; y < height; ++y) { RAWToRGB24Row(src_raw, dst_rgb24, width); @@ -3060,6 +3381,7 @@ int RAWToRGB24(const uint8_t* src_raw, return 0; } +// TODO(fbarchard): Consider uint8_t value LIBYUV_API void SetPlane(uint8_t* dst_y, int dst_stride_y, @@ -3067,7 +3389,7 @@ void SetPlane(uint8_t* dst_y, int height, uint32_t value) { int y; - void (*SetRow)(uint8_t * dst, uint8_t value, int width) = SetRow_C; + void (*SetRow)(uint8_t* dst, uint8_t value, int width) = SetRow_C; if (width <= 0 || height == 0) { return; @@ -3120,7 +3442,7 @@ void SetPlane(uint8_t* dst_y, // Set plane for (y = 0; y < height; ++y) { - SetRow(dst_y, value, width); + SetRow(dst_y, (uint8_t)value, width); dst_y += dst_stride_y; } } @@ -3168,7 +3490,7 @@ int ARGBRect(uint8_t* dst_argb, int height, uint32_t value) { int y; - void (*ARGBSetRow)(uint8_t * dst_argb, uint32_t value, int width) = + void (*ARGBSetRow)(uint8_t* dst_argb, uint32_t value, int width) = ARGBSetRow_C; if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) { return -1; @@ -3293,6 +3615,14 @@ int ARGBAttenuate(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBATTENUATEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX; + 
if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_LSX; + } + } +#endif #if defined(HAS_ARGBATTENUATEROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX; @@ -3301,6 +3631,11 @@ int ARGBAttenuate(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBAttenuateRow(src_argb, dst_argb, width); @@ -3401,6 +3736,11 @@ int ARGBGrayTo(const uint8_t* src_argb, ARGBGrayRow = ARGBGrayRow_MSA; } #endif +#if defined(HAS_ARGBGRAYROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_LSX; + } +#endif #if defined(HAS_ARGBGRAYROW_LASX) if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) { ARGBGrayRow = ARGBGrayRow_LASX; @@ -3451,6 +3791,11 @@ int ARGBGray(uint8_t* dst_argb, ARGBGrayRow = ARGBGrayRow_MSA; } #endif +#if defined(HAS_ARGBGRAYROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_LSX; + } +#endif #if defined(HAS_ARGBGRAYROW_LASX) if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) { ARGBGrayRow = ARGBGrayRow_LASX; @@ -3473,7 +3818,7 @@ int ARGBSepia(uint8_t* dst_argb, int width, int height) { int y; - void (*ARGBSepiaRow)(uint8_t * dst_argb, int width) = ARGBSepiaRow_C; + void (*ARGBSepiaRow)(uint8_t* dst_argb, int width) = ARGBSepiaRow_C; uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; @@ -3499,6 +3844,11 @@ int ARGBSepia(uint8_t* dst_argb, ARGBSepiaRow = ARGBSepiaRow_MSA; } #endif +#if defined(HAS_ARGBSEPIAROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 8)) { + ARGBSepiaRow = ARGBSepiaRow_LSX; + } +#endif #if defined(HAS_ARGBSEPIAROW_LASX) if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 16)) { ARGBSepiaRow = ARGBSepiaRow_LASX; @@ -3616,7 +3966,7 @@ int ARGBColorTable(uint8_t* dst_argb, 
int width, int height) { int y; - void (*ARGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb, + void (*ARGBColorTableRow)(uint8_t* dst_argb, const uint8_t* table_argb, int width) = ARGBColorTableRow_C; uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || @@ -3652,7 +4002,7 @@ int RGBColorTable(uint8_t* dst_argb, int width, int height) { int y; - void (*RGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb, + void (*RGBColorTableRow)(uint8_t* dst_argb, const uint8_t* table_argb, int width) = RGBColorTableRow_C; uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || @@ -3697,7 +4047,7 @@ int ARGBQuantize(uint8_t* dst_argb, int width, int height) { int y; - void (*ARGBQuantizeRow)(uint8_t * dst_argb, int scale, int interval_size, + void (*ARGBQuantizeRow)(uint8_t* dst_argb, int scale, int interval_size, int interval_offset, int width) = ARGBQuantizeRow_C; uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 || @@ -3924,6 +4274,11 @@ int ARGBShade(const uint8_t* src_argb, ARGBShadeRow = ARGBShadeRow_MSA; } #endif +#if defined(HAS_ARGBSHADEROW_LSX) + if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 4)) { + ARGBShadeRow = ARGBShadeRow_LSX; + } +#endif #if defined(HAS_ARGBSHADEROW_LASX) if (TestCpuFlag(kCpuHasLASX) && IS_ALIGNED(width, 8)) { ARGBShadeRow = ARGBShadeRow_LASX; @@ -3950,7 +4305,7 @@ int InterpolatePlane(const uint8_t* src0, int height, int interpolation) { int y; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; if (!src0 || !src1 || !dst || width <= 0 || height == 0) { @@ -4008,6 +4363,11 @@ int InterpolatePlane(const uint8_t* src0, 
} } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { InterpolateRow(dst, src0, src1 - src0, width, interpolation); @@ -4030,7 +4390,7 @@ int InterpolatePlane_16(const uint16_t* src0, int height, int interpolation) { int y; - void (*InterpolateRow_16)(uint16_t * dst_ptr, const uint16_t* src_ptr, + void (*InterpolateRow_16)(uint16_t* dst_ptr, const uint16_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_16_C; if (!src0 || !src1 || !dst || width <= 0 || height == 0) { @@ -4213,6 +4573,14 @@ int ARGBShuffle(const uint8_t* src_bgra, } } #endif +#if defined(HAS_ARGBSHUFFLEROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBShuffleRow = ARGBShuffleRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBShuffleRow = ARGBShuffleRow_LSX; + } + } +#endif #if defined(HAS_ARGBSHUFFLEROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBShuffleRow = ARGBShuffleRow_Any_LASX; @@ -4444,6 +4812,11 @@ static int ARGBSobelize(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYJRow = ARGBToYJRow_RVV; + } +#endif #if defined(HAS_SOBELYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { @@ -4477,16 +4850,16 @@ static int ARGBSobelize(const uint8_t* src_argb, #endif { // 3 rows with edges before/after. - const int kRowSize = (width + kEdge + 31) & ~31; - align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge)); + const int row_size = (width + kEdge + 31) & ~31; + align_buffer_64(rows, row_size * 2 + (kEdge + row_size * 3 + kEdge)); uint8_t* row_sobelx = rows; - uint8_t* row_sobely = rows + kRowSize; - uint8_t* row_y = rows + kRowSize * 2; + uint8_t* row_sobely = rows + row_size; + uint8_t* row_y = rows + row_size * 2; // Convert first row. 
uint8_t* row_y0 = row_y + kEdge; - uint8_t* row_y1 = row_y0 + kRowSize; - uint8_t* row_y2 = row_y1 + kRowSize; + uint8_t* row_y1 = row_y0 + row_size; + uint8_t* row_y2 = row_y1 + row_size; ARGBToYJRow(src_argb, row_y0, width); row_y0[-1] = row_y0[0]; memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind. @@ -5027,9 +5400,6 @@ int ARGBCopyYToAlpha(const uint8_t* src_y, return 0; } -// TODO(fbarchard): Consider if width is even Y channel can be split -// directly. A SplitUVRow_Odd function could copy the remaining chroma. - LIBYUV_API int YUY2ToNV12(const uint8_t* src_yuy2, int src_stride_yuy2, @@ -5040,13 +5410,10 @@ int YUY2ToNV12(const uint8_t* src_yuy2, int width, int height) { int y; - int halfwidth = (width + 1) >> 1; - void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, - int width) = SplitUVRow_C; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = + YUY2ToYRow_C; + void (*YUY2ToNVUVRow)(const uint8_t* src_yuy2, int stride_yuy2, + uint8_t* dst_uv, int width) = YUY2ToNVUVRow_C; if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } @@ -5057,109 +5424,91 @@ int YUY2ToNV12(const uint8_t* src_yuy2, src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; src_stride_yuy2 = -src_stride_yuy2; } -#if defined(HAS_SPLITUVROW_SSE2) +#if defined(HAS_YUY2TOYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - SplitUVRow = SplitUVRow_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_SSE2; + YUY2ToYRow = YUY2ToYRow_SSE2; } } #endif -#if defined(HAS_SPLITUVROW_AVX2) +#if defined(HAS_YUY2TOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - SplitUVRow = SplitUVRow_Any_AVX2; + YUY2ToYRow = YUY2ToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_AVX2; + YUY2ToYRow = YUY2ToYRow_AVX2; } } 
#endif -#if defined(HAS_SPLITUVROW_NEON) +#if defined(HAS_YUY2TOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - SplitUVRow = SplitUVRow_Any_NEON; + YUY2ToYRow = YUY2ToYRow_Any_NEON; if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_NEON; + YUY2ToYRow = YUY2ToYRow_NEON; } } #endif -#if defined(HAS_SPLITUVROW_MSA) +#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUV422ROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { - SplitUVRow = SplitUVRow_Any_MSA; + YUY2ToYRow = YUY2ToYRow_Any_MSA; if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_MSA; + YUY2ToYRow = YUY2ToYRow_MSA; } } #endif -#if defined(HAS_SPLITUVROW_LSX) +#if defined(HAS_YUY2TOYROW_LSX) && defined(HAS_YUY2TOUV422ROW_LSX) if (TestCpuFlag(kCpuHasLSX)) { - SplitUVRow = SplitUVRow_Any_LSX; - if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_LSX; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; + YUY2ToYRow = YUY2ToYRow_Any_LSX; if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; + YUY2ToYRow = YUY2ToYRow_LSX; } } #endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; +#if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUV422ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + YUY2ToYRow = YUY2ToYRow_Any_LASX; if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_AVX2; + YUY2ToYRow = YUY2ToYRow_LASX; } } #endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; + +#if defined(HAS_YUY2TONVUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + YUY2ToNVUVRow = YUY2ToNVUVRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_NEON; + YUY2ToNVUVRow = YUY2ToNVUVRow_SSE2; } } #endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; +#if defined(HAS_YUY2TONVUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { 
+ YUY2ToNVUVRow = YUY2ToNVUVRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_MSA; + YUY2ToNVUVRow = YUY2ToNVUVRow_AVX2; } } #endif -#if defined(HAS_INTERPOLATEROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - InterpolateRow = InterpolateRow_Any_LSX; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_LSX; +#if defined(HAS_YUY2TONVUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToNVUVRow = YUY2ToNVUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + YUY2ToNVUVRow = YUY2ToNVUVRow_NEON; } } #endif - { - int awidth = halfwidth * 2; - // row of y and 2 rows of uv - align_buffer_64(rows, awidth * 3); - - for (y = 0; y < height - 1; y += 2) { - // Split Y from UV. - SplitUVRow(src_yuy2, rows, rows + awidth, awidth); - memcpy(dst_y, rows, width); - SplitUVRow(src_yuy2 + src_stride_yuy2, rows, rows + awidth * 2, awidth); - memcpy(dst_y + dst_stride_y, rows, width); - InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128); - src_yuy2 += src_stride_yuy2 * 2; - dst_y += dst_stride_y * 2; - dst_uv += dst_stride_uv; - } - if (height & 1) { - // Split Y from UV. 
- SplitUVRow(src_yuy2, rows, dst_uv, awidth); - memcpy(dst_y, rows, width); - } - free_aligned_buffer_64(rows); + for (y = 0; y < height - 1; y += 2) { + YUY2ToYRow(src_yuy2, dst_y, width); + YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width); + YUY2ToNVUVRow(src_yuy2, src_stride_yuy2, dst_uv, width); + src_yuy2 += src_stride_yuy2 * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + YUY2ToYRow(src_yuy2, dst_y, width); + YUY2ToNVUVRow(src_yuy2, 0, dst_uv, width); } return 0; } @@ -5177,7 +5526,7 @@ int UYVYToNV12(const uint8_t* src_uyvy, int halfwidth = (width + 1) >> 1; void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) = SplitUVRow_C; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; @@ -5231,6 +5580,12 @@ int UYVYToNV12(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_SPLITUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + SplitUVRow = SplitUVRow_RVV; + } +#endif + #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -5271,6 +5626,11 @@ int UYVYToNV12(const uint8_t* src_uyvy, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif { int awidth = halfwidth * 2; @@ -5336,6 +5696,7 @@ void HalfMergeUVPlane(const uint8_t* src_u, HalfMergeUVRow = HalfMergeUVRow_AVX2; } #endif + for (y = 0; y < height - 1; y += 2) { // Merge a row of U and V into a row of UV. 
HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width); diff --git a/files/source/rotate.cc b/files/source/rotate.cc index f1e83cbd..8d3978c7 100644 --- a/files/source/rotate.cc +++ b/files/source/rotate.cc @@ -138,7 +138,7 @@ void RotatePlane180(const uint8_t* src, int dst_stride, int width, int height) { - // Swap first and last row and mirror the content. Uses a temporary row. + // Swap top and bottom row and mirror the content. Uses a temporary row. align_buffer_64(row, width); const uint8_t* src_bot = src + src_stride * (height - 1); uint8_t* dst_bot = dst + dst_stride * (height - 1); @@ -178,6 +178,14 @@ void RotatePlane180(const uint8_t* src, } } #endif +#if defined(HAS_MIRRORROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MirrorRow = MirrorRow_Any_LSX; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_LSX; + } + } +#endif #if defined(HAS_MIRRORROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { MirrorRow = MirrorRow_Any_LASX; @@ -206,12 +214,17 @@ void RotatePlane180(const uint8_t* src, CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif +#if defined(HAS_COPYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + CopyRow = CopyRow_RVV; + } +#endif // Odd height will harmlessly mirror the middle row twice. 
for (y = 0; y < half_height; ++y) { - CopyRow(src, row, width); // Copy first row into buffer - MirrorRow(src_bot, dst, width); // Mirror last row into first row - MirrorRow(row, dst_bot, width); // Mirror buffer into last row + CopyRow(src, row, width); // Copy top row into buffer + MirrorRow(src_bot, dst, width); // Mirror bottom row into top row + MirrorRow(row, dst_bot, width); // Mirror buffer into bottom row src += src_stride; dst += dst_stride; src_bot -= src_stride; @@ -476,6 +489,120 @@ int RotatePlane(const uint8_t* src, return -1; } +LIBYUV_API +void TransposePlane_16(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height) { + int i = height; + // Work across the source in 8x8 tiles + while (i >= 8) { + TransposeWx8_16_C(src, src_stride, dst, dst_stride, width); + src += 8 * src_stride; // Go down 8 rows. + dst += 8; // Move over 8 columns. + i -= 8; + } + + if (i > 0) { + TransposeWxH_16_C(src, src_stride, dst, dst_stride, width, i); + } +} + +static void RotatePlane90_16(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height) { + // Rotate by 90 is a transpose with the source read + // from bottom to top. So set the source pointer to the end + // of the buffer and flip the sign of the source stride. + src += src_stride * (height - 1); + src_stride = -src_stride; + TransposePlane_16(src, src_stride, dst, dst_stride, width, height); +} + +static void RotatePlane270_16(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height) { + // Rotate by 270 is a transpose with the destination written + // from bottom to top. So set the destination pointer to the end + // of the buffer and flip the sign of the destination stride. 
+ dst += dst_stride * (width - 1); + dst_stride = -dst_stride; + TransposePlane_16(src, src_stride, dst, dst_stride, width, height); +} + +static void RotatePlane180_16(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height) { + // Swap top and bottom row and mirror the content. Uses a temporary row. + align_buffer_64_16(row, width); + const uint16_t* src_bot = src + src_stride * (height - 1); + uint16_t* dst_bot = dst + dst_stride * (height - 1); + int half_height = (height + 1) >> 1; + int y; + + // Odd height will harmlessly mirror the middle row twice. + for (y = 0; y < half_height; ++y) { + CopyRow_16_C(src, row, width); // Copy top row into buffer + MirrorRow_16_C(src_bot, dst, width); // Mirror bottom row into top row + MirrorRow_16_C(row, dst_bot, width); // Mirror buffer into bottom row + src += src_stride; + dst += dst_stride; + src_bot -= src_stride; + dst_bot -= dst_stride; + } + free_aligned_buffer_64_16(row); +} + +LIBYUV_API +int RotatePlane_16(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height, + enum RotationMode mode) { + if (!src || width <= 0 || height == 0 || !dst) { + return -1; + } + + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src = src + (height - 1) * src_stride; + src_stride = -src_stride; + } + + switch (mode) { + case kRotate0: + // copy frame + CopyPlane_16(src, src_stride, dst, dst_stride, width, height); + return 0; + case kRotate90: + RotatePlane90_16(src, src_stride, dst, dst_stride, width, height); + return 0; + case kRotate270: + RotatePlane270_16(src, src_stride, dst, dst_stride, width, height); + return 0; + case kRotate180: + RotatePlane180_16(src, src_stride, dst, dst_stride, width, height); + return 0; + default: + break; + } + return -1; +} + LIBYUV_API int I420Rotate(const uint8_t* src_y, int src_stride_y, @@ -544,6 +671,8 @@ int I420Rotate(const uint8_t* src_y, return -1; } +// I422 has half width x full height UV planes, so rotate by 90 and 270 +// require scaling to maintain 422 subsampling. LIBYUV_API int I422Rotate(const uint8_t* src_y, int src_stride_y, @@ -579,31 +708,42 @@ int I422Rotate(const uint8_t* src_y, switch (mode) { case kRotate0: - // copy frame + // Copy frame CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height); CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height); return 0; + + // Note on temporary Y plane for UV. + // Rotation of UV first fits within the Y destination plane rows. + // Y plane is width x height + // Y plane rotated is height x width + // UV plane is (width / 2) x height + // UV plane rotated is height x (width / 2) + // UV plane rotated+scaled is (height / 2) x width. + // UV plane rotated is a temporary that fits within the Y plane rotated. + case kRotate90: - // We need to rotate and rescale, we use plane Y as temporal storage. 
- RotatePlane90(src_u, src_stride_u, dst_y, height, halfwidth, height); - ScalePlane(dst_y, height, height, halfwidth, dst_u, halfheight, + RotatePlane90(src_u, src_stride_u, dst_y, dst_stride_y, halfwidth, + height); + ScalePlane(dst_y, dst_stride_y, height, halfwidth, dst_u, dst_stride_u, halfheight, width, kFilterBilinear); - RotatePlane90(src_v, src_stride_v, dst_y, height, halfwidth, height); - ScalePlane(dst_y, height, height, halfwidth, dst_v, halfheight, + RotatePlane90(src_v, src_stride_v, dst_y, dst_stride_y, halfwidth, + height); + ScalePlane(dst_y, dst_stride_y, height, halfwidth, dst_v, dst_stride_v, halfheight, width, kFilterLinear); RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); return 0; case kRotate270: - // We need to rotate and rescale, we use plane Y as temporal storage. - RotatePlane270(src_u, src_stride_u, dst_y, height, halfwidth, height); - ScalePlane(dst_y, height, height, halfwidth, dst_u, halfheight, + RotatePlane270(src_u, src_stride_u, dst_y, dst_stride_y, halfwidth, + height); + ScalePlane(dst_y, dst_stride_y, height, halfwidth, dst_u, dst_stride_u, halfheight, width, kFilterBilinear); - RotatePlane270(src_v, src_stride_v, dst_y, height, halfwidth, height); - ScalePlane(dst_y, height, height, halfwidth, dst_v, halfheight, + RotatePlane270(src_v, src_stride_v, dst_y, dst_stride_y, halfwidth, + height); + ScalePlane(dst_y, dst_stride_y, height, halfwidth, dst_v, dst_stride_v, halfheight, width, kFilterLinear); RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - return 0; case kRotate180: RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); @@ -828,6 +968,228 @@ int Android420ToI420Rotate(const uint8_t* src_y, return -1; } +LIBYUV_API +int I010Rotate(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* 
dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || + !dst_u || !dst_v || dst_stride_y < 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // copy frame + return I010Copy(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height); + case kRotate90: + RotatePlane90_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane90_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane90_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); + return 0; + case kRotate270: + RotatePlane270_16(src_y, src_stride_y, dst_y, dst_stride_y, width, + height); + RotatePlane270_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane270_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); + return 0; + case kRotate180: + RotatePlane180_16(src_y, src_stride_y, dst_y, dst_stride_y, width, + height); + RotatePlane180_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane180_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); + return 0; + default: + break; + } + return -1; +} + +// I210 has half width x full height UV planes, so rotate by 90 and 270 +// require scaling to maintain 422 subsampling. 
+LIBYUV_API +int I210Rotate(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || + !dst_u || !dst_v) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // Copy frame + CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height); + CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height); + return 0; + + // Note on temporary Y plane for UV. + // Rotation of UV first fits within the Y destination plane rows. + // Y plane is width x height + // Y plane rotated is height x width + // UV plane is (width / 2) x height + // UV plane rotated is height x (width / 2) + // UV plane rotated+scaled is (height / 2) x width. + // UV plane rotated is a temporary that fits within the Y plane rotated. 
+ + case kRotate90: + RotatePlane90_16(src_u, src_stride_u, dst_y, dst_stride_y, halfwidth, + height); + ScalePlane_16(dst_y, dst_stride_y, height, halfwidth, dst_u, dst_stride_u, + halfheight, width, kFilterBilinear); + RotatePlane90_16(src_v, src_stride_v, dst_y, dst_stride_y, halfwidth, + height); + ScalePlane_16(dst_y, dst_stride_y, height, halfwidth, dst_v, dst_stride_v, + halfheight, width, kFilterLinear); + RotatePlane90_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; + case kRotate270: + RotatePlane270_16(src_u, src_stride_u, dst_y, dst_stride_y, halfwidth, + height); + ScalePlane_16(dst_y, dst_stride_y, height, halfwidth, dst_u, dst_stride_u, + halfheight, width, kFilterBilinear); + RotatePlane270_16(src_v, src_stride_v, dst_y, dst_stride_y, halfwidth, + height); + ScalePlane_16(dst_y, dst_stride_y, height, halfwidth, dst_v, dst_stride_v, + halfheight, width, kFilterLinear); + RotatePlane270_16(src_y, src_stride_y, dst_y, dst_stride_y, width, + height); + return 0; + case kRotate180: + RotatePlane180_16(src_y, src_stride_y, dst_y, dst_stride_y, width, + height); + RotatePlane180_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + height); + RotatePlane180_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + height); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API +int I410Rotate(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode) { + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || + !dst_u || !dst_v || dst_stride_y < 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // copy frame + CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; + case kRotate90: + RotatePlane90_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane90_16(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + RotatePlane90_16(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; + case kRotate270: + RotatePlane270_16(src_y, src_stride_y, dst_y, dst_stride_y, width, + height); + RotatePlane270_16(src_u, src_stride_u, dst_u, dst_stride_u, width, + height); + RotatePlane270_16(src_v, src_stride_v, dst_v, dst_stride_v, width, + height); + return 0; + case kRotate180: + RotatePlane180_16(src_y, src_stride_y, dst_y, dst_stride_y, width, + height); + RotatePlane180_16(src_u, src_stride_u, dst_u, dst_stride_u, width, + height); + RotatePlane180_16(src_v, src_stride_v, dst_v, dst_stride_v, width, + height); + return 0; + default: + break; + } + return -1; +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/rotate_argb.cc b/files/source/rotate_argb.cc index 539cf98d..c7239010 100644 --- a/files/source/rotate_argb.cc +++ b/files/source/rotate_argb.cc @@ -8,11 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "libyuv/rotate.h" +#include "libyuv/rotate_argb.h" #include "libyuv/convert.h" #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" #include "libyuv/row.h" #include "libyuv/scale_row.h" /* for ScaleARGBRowDownEven_ */ @@ -155,6 +156,14 @@ static int ARGBRotate180(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBMIRRORROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBMirrorRow = ARGBMirrorRow_Any_LSX; + if (IS_ALIGNED(width, 8)) { + ARGBMirrorRow = ARGBMirrorRow_LSX; + } + } +#endif #if defined(HAS_ARGBMIRRORROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBMirrorRow = ARGBMirrorRow_Any_LASX; @@ -183,6 +192,11 @@ static int ARGBRotate180(const uint8_t* src_argb, CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif +#if defined(HAS_COPYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + CopyRow = CopyRow_RVV; + } +#endif // Odd height will harmlessly mirror the middle row twice. for (y = 0; y < half_height; ++y) { diff --git a/files/source/rotate_common.cc b/files/source/rotate_common.cc index ff212ade..4b496d1b 100644 --- a/files/source/rotate_common.cc +++ b/files/source/rotate_common.cc @@ -94,12 +94,135 @@ void TransposeUVWxH_C(const uint8_t* src, for (i = 0; i < width * 2; i += 2) { int j; for (j = 0; j < height; ++j) { - dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; - dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; + dst_a[((i >> 1) * dst_stride_a) + j] = src[i + (j * src_stride)]; + dst_b[((i >> 1) * dst_stride_b) + j] = src[i + (j * src_stride) + 1]; } } } +void TransposeWx8_16_C(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst[0] = src[0 * src_stride]; + dst[1] = src[1 * src_stride]; + dst[2] = src[2 * src_stride]; + dst[3] = src[3 * src_stride]; + dst[4] = src[4 * src_stride]; + dst[5] = src[5 * src_stride]; + dst[6] = src[6 * src_stride]; + dst[7] = src[7 * 
src_stride]; + ++src; + dst += dst_stride; + } +} + +void TransposeUVWx8_16_C(const uint16_t* src, + int src_stride, + uint16_t* dst_a, + int dst_stride_a, + uint16_t* dst_b, + int dst_stride_b, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst_a[0] = src[0 * src_stride + 0]; + dst_b[0] = src[0 * src_stride + 1]; + dst_a[1] = src[1 * src_stride + 0]; + dst_b[1] = src[1 * src_stride + 1]; + dst_a[2] = src[2 * src_stride + 0]; + dst_b[2] = src[2 * src_stride + 1]; + dst_a[3] = src[3 * src_stride + 0]; + dst_b[3] = src[3 * src_stride + 1]; + dst_a[4] = src[4 * src_stride + 0]; + dst_b[4] = src[4 * src_stride + 1]; + dst_a[5] = src[5 * src_stride + 0]; + dst_b[5] = src[5 * src_stride + 1]; + dst_a[6] = src[6 * src_stride + 0]; + dst_b[6] = src[6 * src_stride + 1]; + dst_a[7] = src[7 * src_stride + 0]; + dst_b[7] = src[7 * src_stride + 1]; + src += 2; + dst_a += dst_stride_a; + dst_b += dst_stride_b; + } +} + +void TransposeWxH_16_C(const uint16_t* src, + int src_stride, + uint16_t* dst, + int dst_stride, + int width, + int height) { + int i; + for (i = 0; i < width; ++i) { + int j; + for (j = 0; j < height; ++j) { + dst[i * dst_stride + j] = src[j * src_stride + i]; + } + } +} + +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + const uint8_t* src1 = src + src_stride; + const uint8_t* src2 = src1 + src_stride; + const uint8_t* src3 = src2 + src_stride; + uint8_t* dst1 = dst + dst_stride; + uint8_t* dst2 = dst1 + dst_stride; + uint8_t* dst3 = dst2 + dst_stride; + int i; + for (i = 0; i < width; i += 4) { + uint32_t p00 = ((uint32_t*)(src))[0]; + uint32_t p10 = ((uint32_t*)(src))[1]; + uint32_t p20 = ((uint32_t*)(src))[2]; + uint32_t p30 = ((uint32_t*)(src))[3]; + uint32_t p01 = ((uint32_t*)(src1))[0]; + uint32_t p11 = ((uint32_t*)(src1))[1]; + uint32_t p21 = ((uint32_t*)(src1))[2]; + uint32_t p31 = ((uint32_t*)(src1))[3]; + uint32_t p02 = ((uint32_t*)(src2))[0]; 
+ uint32_t p12 = ((uint32_t*)(src2))[1]; + uint32_t p22 = ((uint32_t*)(src2))[2]; + uint32_t p32 = ((uint32_t*)(src2))[3]; + uint32_t p03 = ((uint32_t*)(src3))[0]; + uint32_t p13 = ((uint32_t*)(src3))[1]; + uint32_t p23 = ((uint32_t*)(src3))[2]; + uint32_t p33 = ((uint32_t*)(src3))[3]; + ((uint32_t*)(dst))[0] = p00; + ((uint32_t*)(dst))[1] = p01; + ((uint32_t*)(dst))[2] = p02; + ((uint32_t*)(dst))[3] = p03; + ((uint32_t*)(dst1))[0] = p10; + ((uint32_t*)(dst1))[1] = p11; + ((uint32_t*)(dst1))[2] = p12; + ((uint32_t*)(dst1))[3] = p13; + ((uint32_t*)(dst2))[0] = p20; + ((uint32_t*)(dst2))[1] = p21; + ((uint32_t*)(dst2))[2] = p22; + ((uint32_t*)(dst2))[3] = p23; + ((uint32_t*)(dst3))[0] = p30; + ((uint32_t*)(dst3))[1] = p31; + ((uint32_t*)(dst3))[2] = p32; + ((uint32_t*)(dst3))[3] = p33; + src += src_stride * 4; // advance 4 rows + src1 += src_stride * 4; + src2 += src_stride * 4; + src3 += src_stride * 4; + dst += 4 * 4; // advance 4 columns + dst1 += 4 * 4; + dst2 += 4 * 4; + dst3 += 4 * 4; + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/rotate_gcc.cc b/files/source/rotate_gcc.cc index 1a3f8cbb..fd5eee05 100644 --- a/files/source/rotate_gcc.cc +++ b/files/source/rotate_gcc.cc @@ -365,6 +365,136 @@ void TransposeUVWx8_SSE2(const uint8_t* src, "xmm7", "xmm8", "xmm9"); } #endif // defined(HAS_TRANSPOSEUVWX8_SSE2) + +#if defined(HAS_TRANSPOSE4X4_32_SSE2) +// 4 values, little endian view +// a b c d +// e f g h +// i j k l +// m n o p + +// transpose 2x2 +// a e b f from row 0, 1 +// i m j n from row 2, 3 +// c g d h from row 0, 1 +// k o l p from row 2, 3 + +// transpose 4x4 +// a e i m from row 0, 1 +// b f j n from row 0, 1 +// c g k o from row 2, 3 +// d h l p from row 2, 3 + +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + asm volatile( + // Main loop transpose 4x4. Read a column, write a row. 
+ "1: \n" + "movdqu (%0),%%xmm0 \n" // a b c d + "movdqu (%0,%3),%%xmm1 \n" // e f g h + "lea (%0,%3,2),%0 \n" // src += stride * 2 + "movdqu (%0),%%xmm2 \n" // i j k l + "movdqu (%0,%3),%%xmm3 \n" // m n o p + "lea (%0,%3,2),%0 \n" // src += stride * 2 + + // Transpose 2x2 + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "movdqa %%xmm0,%%xmm6 \n" + "movdqa %%xmm2,%%xmm7 \n" + "punpckldq %%xmm1,%%xmm4 \n" // a e b f from row 0, 1 + "punpckldq %%xmm3,%%xmm5 \n" // i m j n from row 2, 3 + "punpckhdq %%xmm1,%%xmm6 \n" // c g d h from row 0, 1 + "punpckhdq %%xmm3,%%xmm7 \n" // k o l p from row 2, 3 + + // Transpose 4x4 + "movdqa %%xmm4,%%xmm0 \n" + "movdqa %%xmm4,%%xmm1 \n" + "movdqa %%xmm6,%%xmm2 \n" + "movdqa %%xmm6,%%xmm3 \n" + "punpcklqdq %%xmm5,%%xmm0 \n" // a e i m from row 0, 1 + "punpckhqdq %%xmm5,%%xmm1 \n" // b f j n from row 0, 1 + "punpcklqdq %%xmm7,%%xmm2 \n" // c g k o from row 2, 3 + "punpckhqdq %%xmm7,%%xmm3 \n" // d h l p from row 2, 3 + + "movdqu %%xmm0,(%1) \n" + "lea 16(%1,%4),%1 \n" // dst += stride + 16 + "movdqu %%xmm1,-16(%1) \n" + "movdqu %%xmm2,-16(%1,%4) \n" + "movdqu %%xmm3,-16(%1,%4,2) \n" + "sub %4,%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+rm"(width) // %2 + : "r"((ptrdiff_t)(src_stride)), // %3 + "r"((ptrdiff_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // defined(HAS_TRANSPOSE4X4_32_SSE2) + +#if defined(HAS_TRANSPOSE4X4_32_AVX2) + +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_AVX2(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + asm volatile( + // Main loop transpose 2 blocks of 4x4. Read a column, write a row. 
+ "1: \n" + "vmovdqu (%0),%%xmm0 \n" // a b c d + "vmovdqu (%0,%3),%%xmm1 \n" // e f g h + "lea (%0,%3,2),%0 \n" // src += stride * 2 + "vmovdqu (%0),%%xmm2 \n" // i j k l + "vmovdqu (%0,%3),%%xmm3 \n" // m n o p + "lea (%0,%3,2),%0 \n" // src += stride * 2 + + "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // a b c d + "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1 \n" // e f g h + "lea (%0,%3,2),%0 \n" // src += stride * 2 + "vinserti128 $1,(%0),%%ymm2,%%ymm2 \n" // i j k l + "vinserti128 $1,(%0,%3),%%ymm3,%%ymm3 \n" // m n o p + "lea (%0,%3,2),%0 \n" // src += stride * 2 + + // Transpose 2x2 + "vpunpckldq %%ymm1,%%ymm0,%%ymm4 \n" // a e b f from row 0, 1 + "vpunpckldq %%ymm3,%%ymm2,%%ymm5 \n" // i m j n from row 2, 3 + "vpunpckhdq %%ymm1,%%ymm0,%%ymm6 \n" // c g d h from row 0, 1 + "vpunpckhdq %%ymm3,%%ymm2,%%ymm7 \n" // k o l p from row 2, 3 + + // Transpose 4x4 + "vpunpcklqdq %%ymm5,%%ymm4,%%ymm0 \n" // a e i m from row 0, 1 + "vpunpckhqdq %%ymm5,%%ymm4,%%ymm1 \n" // b f j n from row 0, 1 + "vpunpcklqdq %%ymm7,%%ymm6,%%ymm2 \n" // c g k o from row 2, 3 + "vpunpckhqdq %%ymm7,%%ymm6,%%ymm3 \n" // d h l p from row 2, 3 + + "vmovdqu %%ymm0,(%1) \n" + "lea 32(%1,%4),%1 \n" // dst += stride + 32 + "vmovdqu %%ymm1,-32(%1) \n" + "vmovdqu %%ymm2,-32(%1,%4) \n" + "vmovdqu %%ymm3,-32(%1,%4,2) \n" + "sub %4,%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+rm"(width) // %2 + : "r"((ptrdiff_t)(src_stride)), // %3 + "r"((ptrdiff_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // defined(HAS_TRANSPOSE4X4_32_AVX2) + #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/files/source/rotate_mmi.cc b/files/source/rotate_mmi.cc deleted file mode 100644 index f8de6083..00000000 --- a/files/source/rotate_mmi.cc +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/rotate_row.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for Mips MMI. -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -void TransposeWx8_MMI(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; - uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13; - uint8_t* src_tmp = nullptr; - - __asm__ volatile( - "1: \n\t" - "ldc1 %[tmp12], 0x00(%[src]) \n\t" - "dadd %[src_tmp], %[src], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp0 = (00 10 01 11 02 12 03 13) */ - "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" - /* tmp1 = (04 14 05 15 06 16 07 17) */ - "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp2 = (20 30 21 31 22 32 23 33) */ - "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" - /* tmp3 = (24 34 25 35 26 36 27 37) */ - "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" - - /* tmp4 = (00 10 20 30 01 11 21 31) */ - "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t" - /* tmp5 = (02 12 22 32 03 13 23 33) */ - "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t" - /* tmp6 = (04 14 24 34 05 15 25 35) */ - "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t" - /* tmp7 = (06 16 26 36 07 17 27 37) */ - "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - "dadd %[src_tmp], %[src_tmp], 
%[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp0 = (40 50 41 51 42 52 43 53) */ - "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" - /* tmp1 = (44 54 45 55 46 56 47 57) */ - "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp2 = (60 70 61 71 62 72 63 73) */ - "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" - /* tmp3 = (64 74 65 75 66 76 67 77) */ - "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" - - /* tmp8 = (40 50 60 70 41 51 61 71) */ - "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t" - /* tmp9 = (42 52 62 72 43 53 63 73) */ - "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t" - /* tmp10 = (44 54 64 74 45 55 65 75) */ - "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t" - /* tmp11 = (46 56 66 76 47 57 67 77) */ - "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t" - - /* tmp0 = (00 10 20 30 40 50 60 70) */ - "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t" - /* tmp1 = (01 11 21 31 41 51 61 71) */ - "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" - - /* tmp0 = (02 12 22 32 42 52 62 72) */ - "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t" - /* tmp1 = (03 13 23 33 43 53 63 73) */ - "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" - - /* tmp0 = (04 14 24 34 44 54 64 74) */ - "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t" - /* tmp1 = (05 15 25 35 45 55 65 75) */ - "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst]) 
\n\t" - "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" - - /* tmp0 = (06 16 26 36 46 56 66 76) */ - "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t" - /* tmp1 = (07 17 27 37 47 57 67 77) */ - "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" - - "dadd %[dst], %[dst], %[dst_stride] \n\t" - "daddi %[src], %[src], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - - : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), - [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5), - [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8), - [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11), - [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst] "+&r"(dst), - [src_tmp] "+&r"(src_tmp) - : [src] "r"(src), [width] "r"(width), [src_stride] "r"(src_stride), - [dst_stride] "r"(dst_stride) - : "memory"); -} - -void TransposeUVWx8_MMI(const uint8_t* src, - int src_stride, - uint8_t* dst_a, - int dst_stride_a, - uint8_t* dst_b, - int dst_stride_b, - int width) { - uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; - uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13; - uint8_t* src_tmp = nullptr; - - __asm__ volatile( - "1: \n\t" - /* tmp12 = (u00 v00 u01 v01 u02 v02 u03 v03) */ - "ldc1 %[tmp12], 0x00(%[src]) \n\t" - "dadd %[src_tmp], %[src], %[src_stride] \n\t" - /* tmp13 = (u10 v10 u11 v11 u12 v12 u13 v13) */ - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp0 = (u00 u10 v00 v10 u01 u11 v01 v11) */ - "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" - /* tmp1 = (u02 u12 v02 v12 u03 u13 v03 v13) */ - "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" - - "dadd %[src_tmp], %[src_tmp], 
%[src_stride] \n\t" - /* tmp12 = (u20 v20 u21 v21 u22 v22 u23 v23) */ - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - /* tmp13 = (u30 v30 u31 v31 u32 v32 u33 v33) */ - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp2 = (u20 u30 v20 v30 u21 u31 v21 v31) */ - "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" - /* tmp3 = (u22 u32 v22 v32 u23 u33 v23 v33) */ - "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" - - /* tmp4 = (u00 u10 u20 u30 v00 v10 v20 v30) */ - "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t" - /* tmp5 = (u01 u11 u21 u31 v01 v11 v21 v31) */ - "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t" - /* tmp6 = (u02 u12 u22 u32 v02 v12 v22 v32) */ - "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t" - /* tmp7 = (u03 u13 u23 u33 v03 v13 v23 v33) */ - "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - /* tmp12 = (u40 v40 u41 v41 u42 v42 u43 v43) */ - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - /* tmp13 = (u50 v50 u51 v51 u52 v52 u53 v53) */ - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp0 = (u40 u50 v40 v50 u41 u51 v41 v51) */ - "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" - /* tmp1 = (u42 u52 v42 v52 u43 u53 v43 v53) */ - "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" - - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - /* tmp12 = (u60 v60 u61 v61 u62 v62 u63 v63) */ - "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" - /* tmp13 = (u70 v70 u71 v71 u72 v72 u73 v73) */ - "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" - "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" - - /* tmp2 = (u60 u70 v60 v70 u61 u71 v61 v71) */ - "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" - /* tmp3 = (u62 u72 v62 v72 u63 u73 v63 v73) */ - "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" - - /* tmp8 = (u40 u50 u60 u70 v40 v50 v60 v70) */ - "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t" - /* tmp9 = (u41 u51 u61 u71 v41 v51 v61 v71) */ - "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t" - /* tmp10 = (u42 
u52 u62 u72 v42 v52 v62 v72) */ - "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t" - /* tmp11 = (u43 u53 u63 u73 v43 v53 v63 v73) */ - "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t" - - /* tmp0 = (u00 u10 u20 u30 u40 u50 u60 u70) */ - "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t" - /* tmp1 = (v00 v10 v20 v30 v40 v50 v60 v70) */ - "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" - - /* tmp0 = (u01 u11 u21 u31 u41 u51 u61 u71) */ - "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t" - /* tmp1 = (v01 v11 v21 v31 v41 v51 v61 v71) */ - "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t" - "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" - "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" - - /* tmp0 = (u02 u12 u22 u32 u42 u52 u62 u72) */ - "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t" - /* tmp1 = (v02 v12 v22 v32 v42 v52 v62 v72) */ - "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t" - "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" - "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" - - /* tmp0 = (u03 u13 u23 u33 u43 u53 u63 u73) */ - "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t" - /* tmp1 = (v03 v13 v23 v33 v43 v53 v63 v73) */ - "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t" - "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" - "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" - "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" - "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" - "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" - "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" - - "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" - "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" - 
"daddiu %[src], %[src], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), - [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5), - [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8), - [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11), - [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst_a] "+&r"(dst_a), - [dst_b] "+&r"(dst_b), [src_tmp] "+&r"(src_tmp) - : [src] "r"(src), [width] "r"(width), [dst_stride_a] "r"(dst_stride_a), - [dst_stride_b] "r"(dst_stride_b), [src_stride] "r"(src_stride) - : "memory"); -} - -#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/files/source/rotate_neon.cc b/files/source/rotate_neon.cc index 844df2bf..569a7318 100644 --- a/files/source/rotate_neon.cc +++ b/files/source/rotate_neon.cc @@ -410,6 +410,46 @@ void TransposeUVWx8_NEON(const uint8_t* src, : "r"(&kVTbl4x4TransposeDi) // %8 : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); } + +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + const uint8_t* src1 = src + src_stride; + const uint8_t* src2 = src1 + src_stride; + const uint8_t* src3 = src2 + src_stride; + uint8_t* dst1 = dst + dst_stride; + uint8_t* dst2 = dst1 + dst_stride; + uint8_t* dst3 = dst2 + dst_stride; + asm volatile( + // Main loop transpose 4x4. Read a column, write a row. + "1: \n" + "vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n" + "vld4.32 {d0[1], d2[1], d4[1], d6[1]}, [%1], %9 \n" + "vld4.32 {d1[0], d3[0], d5[0], d7[0]}, [%2], %9 \n" + "vld4.32 {d1[1], d3[1], d5[1], d7[1]}, [%3], %9 \n" + "subs %8, %8, #4 \n" // w -= 4 + "vst1.8 {q0}, [%4]! \n" + "vst1.8 {q1}, [%5]! \n" + "vst1.8 {q2}, [%6]! \n" + "vst1.8 {q3}, [%7]! 
\n" + "bgt 1b \n" + + : "+r"(src), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(dst), // %4 + "+r"(dst1), // %5 + "+r"(dst2), // %6 + "+r"(dst3), // %7 + "+r"(width) // %8 + : "r"((ptrdiff_t)(src_stride * 4)) // %9 + : "memory", "cc", "q0", "q1", "q2", "q3"); +} + #endif // defined(__ARM_NEON__) && !defined(__aarch64__) #ifdef __cplusplus diff --git a/files/source/rotate_neon64.cc b/files/source/rotate_neon64.cc index 43c15817..95047fa7 100644 --- a/files/source/rotate_neon64.cc +++ b/files/source/rotate_neon64.cc @@ -201,13 +201,13 @@ void TransposeWx8_NEON(const uint8_t* src, "4: \n" - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(dst), // %2 - "+r"(width) // %3 - : "r"(&kVTbl4x4Transpose), // %4 - "r"(static_cast(src_stride)), // %5 - "r"(static_cast(dst_stride)) // %6 + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst), // %2 + "+r"(width) // %3 + : "r"(&kVTbl4x4Transpose), // %4 + "r"((ptrdiff_t)src_stride), // %5 + "r"((ptrdiff_t)dst_stride) // %6 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); } @@ -423,18 +423,57 @@ void TransposeUVWx8_NEON(const uint8_t* src, "4: \n" - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(dst_a), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : "r"(static_cast(src_stride)), // %5 - "r"(static_cast(dst_stride_a)), // %6 - "r"(static_cast(dst_stride_b)), // %7 - "r"(&kVTbl4x4TransposeDi) // %8 + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst_a), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "r"((ptrdiff_t)src_stride), // %5 + "r"((ptrdiff_t)dst_stride_a), // %6 + "r"((ptrdiff_t)dst_stride_b), // %7 + "r"(&kVTbl4x4TransposeDi) // %8 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31"); } + +// Transpose 32 bit values (ARGB) +void Transpose4x4_32_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, 
+ int width) { + const uint8_t* src1 = src + src_stride; + const uint8_t* src2 = src1 + src_stride; + const uint8_t* src3 = src2 + src_stride; + uint8_t* dst1 = dst + dst_stride; + uint8_t* dst2 = dst1 + dst_stride; + uint8_t* dst3 = dst2 + dst_stride; + asm volatile( + // Main loop transpose 4x4. Read a column, write a row. + "1: \n" + "ld4 {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n" + "ld4 {v0.s, v1.s, v2.s, v3.s}[1], [%1], %9 \n" + "ld4 {v0.s, v1.s, v2.s, v3.s}[2], [%2], %9 \n" + "ld4 {v0.s, v1.s, v2.s, v3.s}[3], [%3], %9 \n" + "subs %w8, %w8, #4 \n" // w -= 4 + "st1 {v0.4s}, [%4], 16 \n" + "st1 {v1.4s}, [%5], 16 \n" + "st1 {v2.4s}, [%6], 16 \n" + "st1 {v3.4s}, [%7], 16 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(dst), // %4 + "+r"(dst1), // %5 + "+r"(dst2), // %6 + "+r"(dst3), // %7 + "+r"(width) // %8 + : "r"((ptrdiff_t)(src_stride * 4)) // %9 + : "memory", "cc", "v0", "v1", "v2", "v3"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/files/source/row_any.cc b/files/source/row_any.cc index 3781a9f2..e574543c 100644 --- a/files/source/row_any.cc +++ b/files/source/row_any.cc @@ -19,7 +19,7 @@ namespace libyuv { extern "C" { #endif -// memset for temp is meant to clear the source buffer (not dest) so that +// memset for vin is meant to clear the source buffer so that // SIMD that reads full multiple of 16 bytes will not trigger msan errors. // memset is not needed for production, as the garbage values are processed but // not used, although there may be edge cases for subsampling. 
@@ -35,20 +35,20 @@ extern "C" { void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \ int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 5]); \ - memset(temp, 0, 64 * 4); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[64 * 4]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, n); \ } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 192, a_buf + n, r); \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ - SS(r, DUVSHIFT) * BPP); \ + memcpy(vin, y_buf + n, r); \ + memcpy(vin + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(vin + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(vin + 192, a_buf + n, r); \ + ANY_SIMD(vin, vin + 64, vin + 128, vin + 192, vout, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } #ifdef HAS_MERGEARGBROW_SSE2 @@ -68,25 +68,25 @@ ANY41(MergeARGBRow_Any_NEON, MergeARGBRow_NEON, 0, 0, 4, 15) void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \ const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 5]); \ - memset(temp, 0, 64 * 4); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[64 * 4]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 
192, a_buf + n, r); \ + memcpy(vin, y_buf + n, r); \ + memcpy(vin + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(vin + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(vin + 192, a_buf + n, r); \ if (width & 1) { \ - temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \ - temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ + vin[64 + SS(r, UVSHIFT)] = vin[64 + SS(r, UVSHIFT) - 1]; \ + vin[128 + SS(r, UVSHIFT)] = vin[128 + SS(r, UVSHIFT) - 1]; \ } \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \ - yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ - SS(r, DUVSHIFT) * BPP); \ + ANY_SIMD(vin, vin + 64, vin + 128, vin + 192, vout, yuvconstants, \ + MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } #ifdef HAS_I444ALPHATOARGBROW_SSSE3 @@ -113,6 +113,9 @@ ANY41C(I444AlphaToARGBRow_Any_MSA, I444AlphaToARGBRow_MSA, 0, 0, 4, 7) #ifdef HAS_I422ALPHATOARGBROW_MSA ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7) #endif +#ifdef HAS_I422ALPHATOARGBROW_LSX +ANY41C(I422AlphaToARGBRow_Any_LSX, I422AlphaToARGBRow_LSX, 1, 0, 4, 15) +#endif #ifdef HAS_I422ALPHATOARGBROW_LASX ANY41C(I422AlphaToARGBRow_Any_LASX, I422AlphaToARGBRow_LASX, 1, 0, 4, 15) #endif @@ -123,21 +126,20 @@ ANY41C(I422AlphaToARGBRow_Any_LASX, I422AlphaToARGBRow_LASX, 1, 0, 4, 15) void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, const T* a_buf, \ uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \ int width) { \ - SIMD_ALIGNED(T temp[16 * 4]); \ - SIMD_ALIGNED(uint8_t out[64]); \ - memset(temp, 0, 16 * 4 * SBPP); /* for YUY2 and msan */ \ + SIMD_ALIGNED(T vin[16 * 4]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ } \ - memcpy(temp, y_buf + n, r * SBPP); \ - memcpy(temp + 16, 
u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 48, a_buf + n, r * SBPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, temp + 48, out, yuvconstants, \ - MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ + memcpy(vin, y_buf + n, r * SBPP); \ + memcpy(vin + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(vin + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(vin + 48, a_buf + n, r * SBPP); \ + ANY_SIMD(vin, vin + 16, vin + 32, vin + 48, vout, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } #ifdef HAS_I210ALPHATOARGBROW_SSSE3 @@ -190,20 +192,20 @@ ANY41CT(I410AlphaToARGBRow_Any_AVX2, #define ANY41PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \ void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \ const STYPE* a_buf, DTYPE* dst_ptr, int depth, int width) { \ - SIMD_ALIGNED(STYPE temp[16 * 4]); \ - SIMD_ALIGNED(DTYPE out[64]); \ - memset(temp, 0, 16 * 4 * SBPP); /* for YUY2 and msan */ \ + SIMD_ALIGNED(STYPE vin[16 * 4]); \ + SIMD_ALIGNED(DTYPE vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(r_buf, g_buf, b_buf, a_buf, dst_ptr, depth, n); \ } \ - memcpy(temp, r_buf + n, r * SBPP); \ - memcpy(temp + 16, g_buf + n, r * SBPP); \ - memcpy(temp + 32, b_buf + n, r * SBPP); \ - memcpy(temp + 48, a_buf + n, r * SBPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, temp + 48, out, depth, MASK + 1); \ - memcpy((uint8_t*)dst_ptr + n * BPP, out, r * BPP); \ + memcpy(vin, r_buf + n, r * SBPP); \ + memcpy(vin + 16, g_buf + n, r * SBPP); \ + memcpy(vin + 32, b_buf + n, r * SBPP); \ + memcpy(vin + 48, a_buf + n, r * SBPP); \ + ANY_SIMD(vin, vin + 16, vin + 32, vin + 48, vout, depth, MASK + 1); \ + memcpy((uint8_t*)dst_ptr + n * BPP, vout, r * BPP); \ } #ifdef 
HAS_MERGEAR64ROW_AVX2 @@ -237,22 +239,22 @@ ANY41PT(MergeARGB16To8Row_Any_NEON, #undef ANY41PT // Any 3 planes to 1. -#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ - const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 4]); \ - memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ - SS(r, DUVSHIFT) * BPP); \ +#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t vin[64 * 3]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \ + } \ + memcpy(vin, y_buf + n, r); \ + memcpy(vin + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(vin + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + ANY_SIMD(vin, vin + 64, vin + 128, vout, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } // Merge functions. 
@@ -285,6 +287,9 @@ ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) #ifdef HAS_I422TOYUY2ROW_MSA ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31) #endif +#ifdef HAS_I422TOYUY2ROW_LSX +ANY31(I422ToYUY2Row_Any_LSX, I422ToYUY2Row_LSX, 1, 1, 4, 15) +#endif #ifdef HAS_I422TOYUY2ROW_LASX ANY31(I422ToYUY2Row_Any_LASX, I422ToYUY2Row_LASX, 1, 1, 4, 31) #endif @@ -294,6 +299,9 @@ ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) #ifdef HAS_I422TOUYVYROW_MSA ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31) #endif +#ifdef HAS_I422TOUYVYROW_LSX +ANY31(I422ToUYVYRow_Any_LSX, I422ToUYVYRow_LSX, 1, 1, 4, 15) +#endif #ifdef HAS_I422TOUYVYROW_LASX ANY31(I422ToUYVYRow_Any_LASX, I422ToUYVYRow_LASX, 1, 1, 4, 31) #endif @@ -308,28 +316,27 @@ ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7) // Note that odd width replication includes 444 due to implementation // on arm that subsamples 444 to 422 internally. // Any 3 planes to 1 with yuvconstants -#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ - const uint8_t* v_buf, uint8_t* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 4]); \ - memset(temp, 0, 128 * 3); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - if (width & 1) { \ - temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ - temp[256 + SS(r, UVSHIFT)] = temp[256 + SS(r, UVSHIFT) - 1]; \ - } \ - ANY_SIMD(temp, temp + 128, temp + 256, temp + 384, yuvconstants, \ - MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 384, \ - SS(r, DUVSHIFT) * BPP); \ +#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, 
DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t vin[128 * 3]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(vin, y_buf + n, r); \ + memcpy(vin + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(vin + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + if (width & 1) { \ + vin[128 + SS(r, UVSHIFT)] = vin[128 + SS(r, UVSHIFT) - 1]; \ + vin[256 + SS(r, UVSHIFT)] = vin[256 + SS(r, UVSHIFT) - 1]; \ + } \ + ANY_SIMD(vin, vin + 128, vin + 256, vout, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } #ifdef HAS_I422TOARGBROW_SSSE3 @@ -359,6 +366,9 @@ ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15) #ifdef HAS_I444TOARGBROW_SSSE3 ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7) #endif +#ifdef HAS_I444TORGB24ROW_SSSE3 +ANY31C(I444ToRGB24Row_Any_SSSE3, I444ToRGB24Row_SSSE3, 0, 0, 3, 15) +#endif #ifdef HAS_I422TORGB24ROW_AVX2 ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31) #endif @@ -374,6 +384,9 @@ ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15) #ifdef HAS_I444TOARGBROW_AVX2 ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15) #endif +#ifdef HAS_I444TORGB24ROW_AVX2 +ANY31C(I444ToRGB24Row_Any_AVX2, I444ToRGB24Row_AVX2, 0, 0, 3, 31) +#endif #ifdef HAS_I422TOARGB4444ROW_AVX2 ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 15) #endif @@ -383,6 +396,9 @@ ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 15) #ifdef HAS_I422TORGB565ROW_AVX2 ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 15) #endif +#ifdef HAS_I444TORGB24ROW_NEON 
+ANY31C(I444ToRGB24Row_Any_NEON, I444ToRGB24Row_NEON, 0, 0, 3, 7) +#endif #ifdef HAS_I422TOARGBROW_NEON ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7) ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7) @@ -401,6 +417,14 @@ ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7) ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7) ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7) #endif +#ifdef HAS_I422TOARGBROW_LSX +ANY31C(I422ToARGBRow_Any_LSX, I422ToARGBRow_LSX, 1, 0, 4, 15) +ANY31C(I422ToRGBARow_Any_LSX, I422ToRGBARow_LSX, 1, 0, 4, 15) +ANY31C(I422ToRGB24Row_Any_LSX, I422ToRGB24Row_LSX, 1, 0, 3, 15) +ANY31C(I422ToRGB565Row_Any_LSX, I422ToRGB565Row_LSX, 1, 0, 2, 15) +ANY31C(I422ToARGB4444Row_Any_LSX, I422ToARGB4444Row_LSX, 1, 0, 2, 15) +ANY31C(I422ToARGB1555Row_Any_LSX, I422ToARGB1555Row_LSX, 1, 0, 2, 15) +#endif #ifdef HAS_I422TOARGBROW_LASX ANY31C(I422ToARGBRow_Any_LASX, I422ToARGBRow_LASX, 1, 0, 4, 31) ANY31C(I422ToRGBARow_Any_LASX, I422ToRGBARow_LASX, 1, 0, 4, 31) @@ -420,19 +444,19 @@ ANY31C(I444ToARGBRow_Any_LSX, I444ToARGBRow_LSX, 0, 0, 4, 15) void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, \ uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \ int width) { \ - SIMD_ALIGNED(T temp[16 * 3]); \ - SIMD_ALIGNED(uint8_t out[64]); \ - memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ + SIMD_ALIGNED(T vin[16 * 3]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ } \ - memcpy(temp, y_buf + n, r * SBPP); \ - memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ + memcpy(vin, 
y_buf + n, r * SBPP); \ + memcpy(vin + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(vin + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(vin, vin + 16, vin + 32, vout, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } #ifdef HAS_I210TOAR30ROW_SSSE3 @@ -477,19 +501,19 @@ ANY31CT(I212ToAR30Row_Any_AVX2, I212ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) #define ANY31PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \ void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \ DTYPE* dst_ptr, int depth, int width) { \ - SIMD_ALIGNED(STYPE temp[16 * 3]); \ - SIMD_ALIGNED(DTYPE out[64]); \ - memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ + SIMD_ALIGNED(STYPE vin[16 * 3]); \ + SIMD_ALIGNED(DTYPE vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(r_buf, g_buf, b_buf, dst_ptr, depth, n); \ } \ - memcpy(temp, r_buf + n, r * SBPP); \ - memcpy(temp + 16, g_buf + n, r * SBPP); \ - memcpy(temp + 32, b_buf + n, r * SBPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, out, depth, MASK + 1); \ - memcpy((uint8_t*)dst_ptr + n * BPP, out, r * BPP); \ + memcpy(vin, r_buf + n, r * SBPP); \ + memcpy(vin + 16, g_buf + n, r * SBPP); \ + memcpy(vin + 32, b_buf + n, r * SBPP); \ + ANY_SIMD(vin, vin + 16, vin + 32, vout, depth, MASK + 1); \ + memcpy((uint8_t*)dst_ptr + n * BPP, vout, r * BPP); \ } #ifdef HAS_MERGEXR30ROW_AVX2 @@ -541,18 +565,19 @@ ANY31PT(MergeXRGB16To8Row_Any_NEON, #define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 3]); \ - memset(temp, 0, 128 * 2); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[128 * 2]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ 
if (n > 0) { \ ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ } \ - memcpy(temp, y_buf + n * SBPP, r * SBPP); \ - memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \ + memcpy(vin, y_buf + n * SBPP, r * SBPP); \ + memcpy(vin + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \ SS(r, UVSHIFT) * SBPP2); \ - ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \ + ANY_SIMD(vin, vin + 128, vout, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ } // Merge functions. @@ -560,7 +585,10 @@ ANY31PT(MergeXRGB16To8Row_Any_NEON, ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15) #endif #ifdef HAS_MERGEUVROW_AVX2 -ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31) +ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 15) +#endif +#ifdef HAS_MERGEUVROW_AVX512BW +ANY21(MergeUVRow_Any_AVX512BW, MergeUVRow_AVX512BW, 0, 1, 1, 2, 31) #endif #ifdef HAS_MERGEUVROW_NEON ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15) @@ -611,18 +639,27 @@ ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7) #ifdef HAS_ARGBMULTIPLYROW_MSA ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3) #endif +#ifdef HAS_ARGBMULTIPLYROW_LSX +ANY21(ARGBMultiplyRow_Any_LSX, ARGBMultiplyRow_LSX, 0, 4, 4, 4, 3) +#endif #ifdef HAS_ARGBMULTIPLYROW_LASX ANY21(ARGBMultiplyRow_Any_LASX, ARGBMultiplyRow_LASX, 0, 4, 4, 4, 7) #endif #ifdef HAS_ARGBADDROW_MSA ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7) #endif +#ifdef HAS_ARGBADDROW_LSX +ANY21(ARGBAddRow_Any_LSX, ARGBAddRow_LSX, 0, 4, 4, 4, 3) +#endif #ifdef HAS_ARGBADDROW_LASX ANY21(ARGBAddRow_Any_LASX, ARGBAddRow_LASX, 0, 4, 4, 4, 7) #endif #ifdef HAS_ARGBSUBTRACTROW_MSA ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7) #endif +#ifdef HAS_ARGBSUBTRACTROW_LSX +ANY21(ARGBSubtractRow_Any_LSX, ARGBSubtractRow_LSX, 0, 4, 4, 4, 3) +#endif #ifdef HAS_ARGBSUBTRACTROW_LASX ANY21(ARGBSubtractRow_Any_LASX, ARGBSubtractRow_LASX, 0, 4, 4, 4, 7) #endif @@ 
-664,22 +701,53 @@ ANY21(SobelXYRow_Any_LSX, SobelXYRow_LSX, 0, 1, 1, 4, 15) #endif #undef ANY21 +// Any 2 planes to 1 with stride +// width is measured in source pixels. 4 bytes contains 2 pixels +#define ANY21S(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, \ + int width) { \ + SIMD_ALIGNED(uint8_t vin[32 * 2]); \ + SIMD_ALIGNED(uint8_t vout[32]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int awidth = (width + 1) / 2; \ + int r = awidth & MASK; \ + int n = awidth & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_yuy2, stride_yuy2, dst_uv, n * 2); \ + } \ + memcpy(vin, src_yuy2 + n * SBPP, r * SBPP); \ + memcpy(vin + 32, src_yuy2 + stride_yuy2 + n * SBPP, r * SBPP); \ + ANY_SIMD(vin, 32, vout, MASK + 1); \ + memcpy(dst_uv + n * BPP, vout, r * BPP); \ + } + +#ifdef HAS_YUY2TONVUVROW_NEON +ANY21S(YUY2ToNVUVRow_Any_NEON, YUY2ToNVUVRow_NEON, 4, 2, 7) +#endif +#ifdef HAS_YUY2TONVUVROW_SSE2 +ANY21S(YUY2ToNVUVRow_Any_SSE2, YUY2ToNVUVRow_SSE2, 4, 2, 7) +#endif +#ifdef HAS_YUY2TONVUVROW_AVX2 +ANY21S(YUY2ToNVUVRow_Any_AVX2, YUY2ToNVUVRow_AVX2, 4, 2, 15) +#endif + // Any 2 planes to 1 with yuvconstants #define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 3]); \ - memset(temp, 0, 128 * 2); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[128 * 2]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ } \ - memcpy(temp, y_buf + n * SBPP, r * SBPP); \ - memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \ + memcpy(vin, y_buf + n * SBPP, r * SBPP); \ + memcpy(vin + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \ SS(r, UVSHIFT) * SBPP2); \ - ANY_SIMD(temp, temp + 128, temp + 256, yuvconstants, 
MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \ + ANY_SIMD(vin, vin + 128, vout, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ } // Biplanar to RGB. @@ -758,21 +826,21 @@ ANY21C(NV12ToRGB565Row_Any_LASX, NV12ToRGB565Row_LASX, 1, 1, 2, 2, 15) #undef ANY21C // Any 2 planes of 16 bit to 1 with yuvconstants -#define ANY21CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ - void NAMEANY(const T* y_buf, const T* uv_buf, uint8_t* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(T temp[16 * 3]); \ - SIMD_ALIGNED(uint8_t out[64]); \ - memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r * SBPP); \ - memcpy(temp + 16, uv_buf + 2 * (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP * 2); \ - ANY_SIMD(temp, temp + 16, out, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ +#define ANY21CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ + void NAMEANY(const T* y_buf, const T* uv_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(T vin[16 * 2]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(vin, y_buf + n, r * SBPP); \ + memcpy(vin + 16, uv_buf + 2 * (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP * 2); \ + ANY_SIMD(vin, vin + 16, vout, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, vout, SS(r, DUVSHIFT) * BPP); \ } #ifdef HAS_P210TOAR30ROW_SSSE3 @@ -806,21 +874,22 @@ ANY21CT(P410ToAR30Row_Any_AVX2, P410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15) #define ANY21PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \ void NAMEANY(const T* src_u, const T* src_v, T* 
dst_uv, int depth, \ int width) { \ - SIMD_ALIGNED(T temp[16 * 4]); \ - memset(temp, 0, 16 * 4 * BPP); /* for msan */ \ + SIMD_ALIGNED(T vin[16 * 2]); \ + SIMD_ALIGNED(T vout[16]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_u, src_v, dst_uv, depth, n); \ } \ - memcpy(temp, src_u + n, r * BPP); \ - memcpy(temp + 16, src_v + n, r * BPP); \ - ANY_SIMD(temp, temp + 16, temp + 32, depth, MASK + 1); \ - memcpy(dst_uv + n * 2, temp + 32, r * BPP * 2); \ + memcpy(vin, src_u + n, r * BPP); \ + memcpy(vin + 16, src_v + n, r * BPP); \ + ANY_SIMD(vin, vin + 16, vout, depth, MASK + 1); \ + memcpy(dst_uv + n * 2, vout, r * BPP * 2); \ } #ifdef HAS_MERGEUVROW_16_AVX2 -ANY21PT(MergeUVRow_16_Any_AVX2, MergeUVRow_16_AVX2, uint16_t, 2, 15) +ANY21PT(MergeUVRow_16_Any_AVX2, MergeUVRow_16_AVX2, uint16_t, 2, 7) #endif #ifdef HAS_MERGEUVROW_16_NEON ANY21PT(MergeUVRow_16_Any_NEON, MergeUVRow_16_NEON, uint16_t, 2, 7) @@ -829,18 +898,19 @@ ANY21PT(MergeUVRow_16_Any_NEON, MergeUVRow_16_NEON, uint16_t, 2, 7) #undef ANY21CT // Any 1 to 1. 
-#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 2]); \ - memset(temp, 0, 128); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(temp, temp + 128, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ +#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t vin[128]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(vin, vout, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ } #ifdef HAS_COPYROW_AVX @@ -931,6 +1001,13 @@ ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7) ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15) #endif +#if defined(HAS_ARGBTORGB24ROW_LSX) +ANY11(ARGBToRGB24Row_Any_LSX, ARGBToRGB24Row_LSX, 0, 4, 3, 15) +ANY11(ARGBToRAWRow_Any_LSX, ARGBToRAWRow_LSX, 0, 4, 3, 15) +ANY11(ARGBToRGB565Row_Any_LSX, ARGBToRGB565Row_LSX, 0, 4, 2, 7) +ANY11(ARGBToARGB1555Row_Any_LSX, ARGBToARGB1555Row_LSX, 0, 4, 2, 7) +ANY11(ARGBToARGB4444Row_Any_LSX, ARGBToARGB4444Row_LSX, 0, 4, 2, 7) +#endif #if defined(HAS_ARGBTORGB24ROW_LASX) ANY11(ARGBToRGB24Row_Any_LASX, ARGBToRGB24Row_LASX, 0, 4, 3, 31) ANY11(ARGBToRAWRow_Any_LASX, ARGBToRAWRow_LASX, 0, 4, 3, 31) @@ -959,6 +1036,9 @@ ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31) #ifdef HAS_ARGBTOYJROW_AVX2 ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31) #endif +#ifdef 
HAS_ABGRTOYJROW_AVX2 +ANY11(ABGRToYJRow_Any_AVX2, ABGRToYJRow_AVX2, 0, 4, 1, 31) +#endif #ifdef HAS_RGBATOYJROW_AVX2 ANY11(RGBAToYJRow_Any_AVX2, RGBAToYJRow_AVX2, 0, 4, 1, 31) #endif @@ -983,6 +1063,9 @@ ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15) #ifdef HAS_ARGBTOYJROW_SSSE3 ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15) #endif +#ifdef HAS_ABGRTOYJROW_SSSE3 +ANY11(ABGRToYJRow_Any_SSSE3, ABGRToYJRow_SSSE3, 0, 4, 1, 15) +#endif #ifdef HAS_RGBATOYJROW_SSSE3 ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15) #endif @@ -992,12 +1075,18 @@ ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 15) #ifdef HAS_ARGBTOYROW_MSA ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15) #endif +#ifdef HAS_ARGBTOYROW_LSX +ANY11(ARGBToYRow_Any_LSX, ARGBToYRow_LSX, 0, 4, 1, 15) +#endif #ifdef HAS_ARGBTOYROW_LASX ANY11(ARGBToYRow_Any_LASX, ARGBToYRow_LASX, 0, 4, 1, 31) #endif #ifdef HAS_ARGBTOYJROW_NEON ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 15) #endif +#ifdef HAS_ABGRTOYJROW_NEON +ANY11(ABGRToYJRow_Any_NEON, ABGRToYJRow_NEON, 0, 4, 1, 15) +#endif #ifdef HAS_RGBATOYJROW_NEON ANY11(RGBAToYJRow_Any_NEON, RGBAToYJRow_NEON, 0, 4, 1, 15) #endif @@ -1007,9 +1096,21 @@ ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15) #ifdef HAS_ARGBTOYJROW_LSX ANY11(ARGBToYJRow_Any_LSX, ARGBToYJRow_LSX, 0, 4, 1, 15) #endif +#ifdef HAS_RGBATOYJROW_LSX +ANY11(RGBAToYJRow_Any_LSX, RGBAToYJRow_LSX, 0, 4, 1, 15) +#endif +#ifdef HAS_ABGRTOYJROW_LSX +ANY11(ABGRToYJRow_Any_LSX, ABGRToYJRow_LSX, 0, 4, 1, 15) +#endif +#ifdef HAS_RGBATOYJROW_LASX +ANY11(RGBAToYJRow_Any_LASX, RGBAToYJRow_LASX, 0, 4, 1, 31) +#endif #ifdef HAS_ARGBTOYJROW_LASX ANY11(ARGBToYJRow_Any_LASX, ARGBToYJRow_LASX, 0, 4, 1, 31) #endif +#ifdef HAS_ABGRTOYJROW_LASX +ANY11(ABGRToYJRow_Any_LASX, ABGRToYJRow_LASX, 0, 4, 1, 31) +#endif #ifdef HAS_BGRATOYROW_NEON ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 15) #endif @@ -1019,6 +1120,9 @@ ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 
1, 15) #ifdef HAS_BGRATOYROW_LSX ANY11(BGRAToYRow_Any_LSX, BGRAToYRow_LSX, 0, 4, 1, 15) #endif +#ifdef HAS_BGRATOYROW_LASX +ANY11(BGRAToYRow_Any_LASX, BGRAToYRow_LASX, 0, 4, 1, 31) +#endif #ifdef HAS_ABGRTOYROW_NEON ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 15) #endif @@ -1028,6 +1132,9 @@ ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7) #ifdef HAS_ABGRTOYROW_LSX ANY11(ABGRToYRow_Any_LSX, ABGRToYRow_LSX, 0, 4, 1, 15) #endif +#ifdef HAS_ABGRTOYROW_LASX +ANY11(ABGRToYRow_Any_LASX, ABGRToYRow_LASX, 0, 4, 1, 31) +#endif #ifdef HAS_RGBATOYROW_NEON ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 15) #endif @@ -1037,6 +1144,9 @@ ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15) #ifdef HAS_RGBATOYROW_LSX ANY11(RGBAToYRow_Any_LSX, RGBAToYRow_LSX, 0, 4, 1, 15) #endif +#ifdef HAS_RGBATOYROW_LASX +ANY11(RGBAToYRow_Any_LASX, RGBAToYRow_LASX, 0, 4, 1, 31) +#endif #ifdef HAS_RGB24TOYROW_NEON ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 15) #endif @@ -1055,6 +1165,12 @@ ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15) #ifdef HAS_RGB24TOYROW_LSX ANY11(RGB24ToYRow_Any_LSX, RGB24ToYRow_LSX, 0, 3, 1, 15) #endif +#ifdef HAS_RGB24TOYJROW_LSX +ANY11(RGB24ToYJRow_Any_LSX, RGB24ToYJRow_LSX, 0, 3, 1, 15) +#endif +#ifdef HAS_RGB24TOYJROW_LASX +ANY11(RGB24ToYJRow_Any_LASX, RGB24ToYJRow_LASX, 0, 3, 1, 31) +#endif #ifdef HAS_RGB24TOYROW_LASX ANY11(RGB24ToYRow_Any_LASX, RGB24ToYRow_LASX, 0, 3, 1, 31) #endif @@ -1079,6 +1195,12 @@ ANY11(RAWToYRow_Any_LSX, RAWToYRow_LSX, 0, 3, 1, 15) #ifdef HAS_RAWTOYROW_LASX ANY11(RAWToYRow_Any_LASX, RAWToYRow_LASX, 0, 3, 1, 31) #endif +#ifdef HAS_RAWTOYJROW_LSX +ANY11(RAWToYJRow_Any_LSX, RAWToYJRow_LSX, 0, 3, 1, 15) +#endif +#ifdef HAS_RAWTOYJROW_LASX +ANY11(RAWToYJRow_Any_LASX, RAWToYJRow_LASX, 0, 3, 1, 31) +#endif #ifdef HAS_RGB565TOYROW_NEON ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7) #endif @@ -1115,12 +1237,18 @@ ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15) #ifdef HAS_YUY2TOYROW_MSA 
ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31) #endif +#ifdef HAS_YUY2TOYROW_LSX +ANY11(YUY2ToYRow_Any_LSX, YUY2ToYRow_LSX, 1, 4, 1, 15) +#endif #ifdef HAS_YUY2TOYROW_LASX ANY11(YUY2ToYRow_Any_LASX, YUY2ToYRow_LASX, 1, 4, 1, 31) #endif #ifdef HAS_UYVYTOYROW_MSA ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31) #endif +#ifdef HAS_UYVYTOYROW_LSX +ANY11(UYVYToYRow_Any_LSX, UYVYToYRow_LSX, 1, 4, 1, 15) +#endif #ifdef HAS_UYVYTOYROW_LASX ANY11(UYVYToYRow_Any_LASX, UYVYToYRow_LASX, 1, 4, 1, 31) #endif @@ -1217,6 +1345,9 @@ ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7) #ifdef HAS_ARGBATTENUATEROW_MSA ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7) #endif +#ifdef HAS_ARGBATTENUATEROW_LSX +ANY11(ARGBAttenuateRow_Any_LSX, ARGBAttenuateRow_LSX, 0, 4, 4, 7) +#endif #ifdef HAS_ARGBATTENUATEROW_LASX ANY11(ARGBAttenuateRow_Any_LASX, ARGBAttenuateRow_LASX, 0, 4, 4, 15) #endif @@ -1238,19 +1369,21 @@ ANY11(ARGBExtractAlphaRow_Any_LSX, ARGBExtractAlphaRow_LSX, 0, 4, 1, 15) #undef ANY11 // Any 1 to 1 blended. Destination is read, modify, write. 
-#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 2]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 64, dst_ptr + n * BPP, r * BPP); \ - ANY_SIMD(temp, temp + 64, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ +#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t vin[64]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + memset(vout, 0, sizeof(vout)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + memcpy(vout, dst_ptr + n * BPP, r * BPP); \ + ANY_SIMD(vin, vout, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ } #ifdef HAS_ARGBCOPYALPHAROW_AVX2 @@ -1270,16 +1403,17 @@ ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) // Any 1 to 1 with parameter. 
#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 2]); \ - memset(temp, 0, 64); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[64]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, dst_ptr, param, n); \ } \ - memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ - ANY_SIMD(temp, temp + 64, param, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ + memcpy(vin, src_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(vin, vout, param, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ } #if defined(HAS_I400TOARGBROW_SSE2) @@ -1355,6 +1489,14 @@ ANY11P(ARGBToRGB565DitherRow_Any_MSA, 2, 7) #endif +#if defined(HAS_ARGBTORGB565DITHERROW_LSX) +ANY11P(ARGBToRGB565DitherRow_Any_LSX, + ARGBToRGB565DitherRow_LSX, + const uint32_t, + 4, + 2, + 7) +#endif #if defined(HAS_ARGBTORGB565DITHERROW_LASX) ANY11P(ARGBToRGB565DitherRow_Any_LASX, ARGBToRGB565DitherRow_LASX, @@ -1375,6 +1517,9 @@ ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3) #ifdef HAS_ARGBSHUFFLEROW_MSA ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7) #endif +#ifdef HAS_ARGBSHUFFLEROW_LSX +ANY11P(ARGBShuffleRow_Any_LSX, ARGBShuffleRow_LSX, const uint8_t*, 4, 4, 7) +#endif #ifdef HAS_ARGBSHUFFLEROW_LASX ANY11P(ARGBShuffleRow_Any_LASX, ARGBShuffleRow_LASX, const uint8_t*, 4, 4, 15) #endif @@ -1384,17 +1529,17 @@ ANY11P(ARGBShuffleRow_Any_LASX, ARGBShuffleRow_LASX, const uint8_t*, 4, 4, 15) // Any 1 to 1 with type #define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[(MASK + 1) * SBPP]); \ - SIMD_ALIGNED(uint8_t out[(MASK + 1) * BPP]); \ - memset(temp, 0, (MASK + 1) * SBPP); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[(MASK + 1) 
* SBPP]); \ + SIMD_ALIGNED(uint8_t vout[(MASK + 1) * BPP]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, dst_ptr, n); \ } \ - memcpy(temp, (uint8_t*)(src_ptr) + n * SBPP, r * SBPP); \ - ANY_SIMD((STYPE*)temp, (DTYPE*)out, MASK + 1); \ - memcpy((uint8_t*)(dst_ptr) + n * BPP, out, r * BPP); \ + memcpy(vin, (uint8_t*)(src_ptr) + n * SBPP, r * SBPP); \ + ANY_SIMD((STYPE*)vin, (DTYPE*)vout, MASK + 1); \ + memcpy((uint8_t*)(dst_ptr) + n * BPP, vout, r * BPP); \ } #ifdef HAS_ARGBTOAR64ROW_SSSE3 @@ -1450,17 +1595,17 @@ ANY11T(AB64ToARGBRow_Any_NEON, AB64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7) // Any 1 to 1 with parameter and shorts. BPP measures in shorts. #define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \ - SIMD_ALIGNED(STYPE temp[32]); \ - SIMD_ALIGNED(DTYPE out[32]); \ - memset(temp, 0, 32 * SBPP); /* for msan */ \ + SIMD_ALIGNED(STYPE vin[32]); \ + SIMD_ALIGNED(DTYPE vout[32]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, dst_ptr, scale, n); \ } \ - memcpy(temp, src_ptr + n, r * SBPP); \ - ANY_SIMD(temp, out, scale, MASK + 1); \ - memcpy(dst_ptr + n, out, r * BPP); \ + memcpy(vin, src_ptr + n, r * SBPP); \ + ANY_SIMD(vin, vout, scale, MASK + 1); \ + memcpy(dst_ptr + n, vout, r * BPP); \ } #ifdef HAS_CONVERT16TO8ROW_SSSE3 @@ -1537,17 +1682,17 @@ ANY11C(DivideRow_16_Any_NEON, DivideRow_16_NEON, 2, 2, uint16_t, uint16_t, 15) // Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts. 
#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \ void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \ - SIMD_ALIGNED(ST temp[32]); \ - SIMD_ALIGNED(T out[32]); \ - memset(temp, 0, SBPP * 32); /* for msan */ \ + SIMD_ALIGNED(ST vin[32]); \ + SIMD_ALIGNED(T vout[32]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, dst_ptr, param, n); \ } \ - memcpy(temp, src_ptr + n, r * SBPP); \ - ANY_SIMD(temp, out, param, MASK + 1); \ - memcpy(dst_ptr + n, out, r * BPP); \ + memcpy(vin, src_ptr + n, r * SBPP); \ + ANY_SIMD(vin, vout, param, MASK + 1); \ + memcpy(dst_ptr + n, vout, r * BPP); \ } #ifdef HAS_HALFFLOATROW_SSE2 @@ -1588,20 +1733,22 @@ ANY11P16(HalfFloatRow_Any_LSX, HalfFloatRow_LSX, uint16_t, uint16_t, 2, 2, 31) #undef ANY11P16 // Any 1 to 1 with yuvconstants -#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 2]); \ - memset(temp, 0, 128); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ +#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t vin[128]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \ + } \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(vin, vout, 
yuvconstants, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ } + #if defined(HAS_YUY2TOARGBROW_SSSE3) ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15) ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15) @@ -1628,21 +1775,21 @@ ANY11C(UYVYToARGBRow_Any_LSX, UYVYToARGBRow_LSX, 1, 4, 4, 7) #define ANY11I(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \ void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \ int width, int source_y_fraction) { \ - SIMD_ALIGNED(TS temps[64 * 2]); \ - SIMD_ALIGNED(TD tempd[64]); \ - memset(temps, 0, sizeof(temps)); /* for msan */ \ + SIMD_ALIGNED(TS vin[64 * 2]); \ + SIMD_ALIGNED(TD vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \ } \ - memcpy(temps, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \ + memcpy(vin, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \ if (source_y_fraction) { \ - memcpy(temps + 64, src_ptr + src_stride + n * SBPP, \ + memcpy(vin + 64, src_ptr + src_stride + n * SBPP, \ r * SBPP * sizeof(TS)); \ } \ - ANY_SIMD(tempd, temps, 64, MASK + 1, source_y_fraction); \ - memcpy(dst_ptr + n * BPP, tempd, r * BPP * sizeof(TD)); \ + ANY_SIMD(vout, vin, 64, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP * sizeof(TD)); \ } #ifdef HAS_INTERPOLATEROW_AVX2 @@ -1682,21 +1829,21 @@ ANY11I(InterpolateRow_16_Any_NEON, #define ANY11IS(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \ void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \ int scale, int width, int source_y_fraction) { \ - SIMD_ALIGNED(TS temps[64 * 2]); \ - SIMD_ALIGNED(TD tempd[64]); \ - memset(temps, 0, sizeof(temps)); /* for msan */ \ + SIMD_ALIGNED(TS vin[64 * 2]); \ + SIMD_ALIGNED(TD vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(dst_ptr, src_ptr, src_stride, 
scale, n, source_y_fraction); \ } \ - memcpy(temps, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \ + memcpy(vin, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \ if (source_y_fraction) { \ - memcpy(temps + 64, src_ptr + src_stride + n * SBPP, \ + memcpy(vin + 64, src_ptr + src_stride + n * SBPP, \ r * SBPP * sizeof(TS)); \ } \ - ANY_SIMD(tempd, temps, 64, scale, MASK + 1, source_y_fraction); \ - memcpy(dst_ptr + n * BPP, tempd, r * BPP * sizeof(TD)); \ + ANY_SIMD(vout, vin, 64, scale, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP * sizeof(TD)); \ } #ifdef HAS_INTERPOLATEROW_16TO8_NEON @@ -1721,18 +1868,19 @@ ANY11IS(InterpolateRow_16To8_Any_AVX2, #undef ANY11IS // Any 1 to 1 mirror. -#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 2]); \ - memset(temp, 0, 64); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr, r* BPP); \ - ANY_SIMD(temp, temp + 64, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \ +#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t vin[64]); \ + SIMD_ALIGNED(uint8_t vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ + } \ + memcpy(vin, src_ptr, r* BPP); \ + ANY_SIMD(vin, vout, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout + (MASK + 1 - r) * BPP, r * BPP); \ } #ifdef HAS_MIRRORROW_AVX2 @@ -1747,6 +1895,9 @@ ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 31) #ifdef HAS_MIRRORROW_MSA ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63) #endif +#ifdef HAS_MIRRORROW_LSX +ANY11M(MirrorRow_Any_LSX, MirrorRow_LSX, 1, 31) +#endif #ifdef HAS_MIRRORROW_LASX 
ANY11M(MirrorRow_Any_LASX, MirrorRow_LASX, 1, 63) #endif @@ -1762,6 +1913,9 @@ ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31) #ifdef HAS_MIRRORUVROW_MSA ANY11M(MirrorUVRow_Any_MSA, MirrorUVRow_MSA, 2, 7) #endif +#ifdef HAS_MIRRORUVROW_LSX +ANY11M(MirrorUVRow_Any_LSX, MirrorUVRow_LSX, 2, 7) +#endif #ifdef HAS_MIRRORUVROW_LASX ANY11M(MirrorUVRow_Any_LASX, MirrorUVRow_LASX, 2, 15) #endif @@ -1777,6 +1931,9 @@ ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7) #ifdef HAS_ARGBMIRRORROW_MSA ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15) #endif +#ifdef HAS_ARGBMIRRORROW_LSX +ANY11M(ARGBMirrorRow_Any_LSX, ARGBMirrorRow_LSX, 4, 7) +#endif #ifdef HAS_ARGBMIRRORROW_LASX ANY11M(ARGBMirrorRow_Any_LASX, ARGBMirrorRow_LASX, 4, 15) #endif @@ -1791,15 +1948,14 @@ ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15) // Any 1 plane. (memset) #define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \ - SIMD_ALIGNED(uint8_t temp[64]); \ - memset(temp, 0, 64); /* for msan */ \ + SIMD_ALIGNED(uint8_t vout[64]); \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(dst_ptr, v32, n); \ } \ - ANY_SIMD(temp, v32, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp, r * BPP); \ + ANY_SIMD(vout, v32, MASK + 1); \ + memcpy(dst_ptr + n * BPP, vout, r * BPP); \ } #ifdef HAS_SETROW_X86 @@ -1823,20 +1979,21 @@ ANY1(ARGBSetRow_Any_LSX, ARGBSetRow_LSX, uint32_t, 4, 3) #undef ANY1 // Any 1 to 2. Outputs UV planes. 
-#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \ - int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 3]); \ - memset(temp, 0, 128); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_u, dst_v, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ - memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \ - memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \ +#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \ + int width) { \ + SIMD_ALIGNED(uint8_t vin[128]); \ + SIMD_ALIGNED(uint8_t vout[128 * 2]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_u, dst_v, n); \ + } \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + ANY_SIMD(vin, vout, vout + 128, MASK + 1); \ + memcpy(dst_u + (n >> DUVSHIFT), vout, SS(r, DUVSHIFT)); \ + memcpy(dst_v + (n >> DUVSHIFT), vout + 128, SS(r, DUVSHIFT)); \ } #ifdef HAS_SPLITUVROW_SSE2 @@ -1875,6 +2032,11 @@ ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15) ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31) ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31) #endif +#ifdef HAS_YUY2TOUV422ROW_LSX +ANY12(ARGBToUV444Row_Any_LSX, ARGBToUV444Row_LSX, 0, 4, 0, 15) +ANY12(YUY2ToUV422Row_Any_LSX, YUY2ToUV422Row_LSX, 1, 4, 1, 15) +ANY12(UYVYToUV422Row_Any_LSX, UYVYToUV422Row_LSX, 1, 4, 1, 15) +#endif #ifdef HAS_YUY2TOUV422ROW_LASX ANY12(ARGBToUV444Row_Any_LASX, ARGBToUV444Row_LASX, 0, 4, 0, 31) ANY12(YUY2ToUV422Row_Any_LASX, YUY2ToUV422Row_LASX, 1, 4, 1, 31) @@ -1885,17 +2047,18 @@ ANY12(UYVYToUV422Row_Any_LASX, UYVYToUV422Row_LASX, 1, 4, 1, 31) // Any 2 16 bit 
planes with parameter to 1 #define ANY12PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \ void NAMEANY(const T* src_uv, T* dst_u, T* dst_v, int depth, int width) { \ - SIMD_ALIGNED(T temp[16 * 4]); \ - memset(temp, 0, 16 * 4 * BPP); /* for msan */ \ + SIMD_ALIGNED(T vin[16 * 2]); \ + SIMD_ALIGNED(T vout[16 * 2]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_uv, dst_u, dst_v, depth, n); \ } \ - memcpy(temp, src_uv + n * 2, r * BPP * 2); \ - ANY_SIMD(temp, temp + 32, temp + 48, depth, MASK + 1); \ - memcpy(dst_u + n, temp + 32, r * BPP); \ - memcpy(dst_v + n, temp + 48, r * BPP); \ + memcpy(vin, src_uv + n * 2, r * BPP * 2); \ + ANY_SIMD(vin, vout, vout + 16, depth, MASK + 1); \ + memcpy(dst_u + n, vout, r * BPP); \ + memcpy(dst_v + n, vout + 16, r * BPP); \ } #ifdef HAS_SPLITUVROW_16_AVX2 @@ -1909,21 +2072,22 @@ ANY12PT(SplitUVRow_16_Any_NEON, SplitUVRow_16_NEON, uint16_t, 2, 7) #undef ANY21CT // Any 1 to 3. Outputs RGB planes. 
-#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ - uint8_t* dst_b, int width) { \ - SIMD_ALIGNED(uint8_t temp[16 * 6]); \ - memset(temp, 0, 16 * 3); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \ - } \ - memcpy(temp, src_ptr + n * BPP, r * BPP); \ - ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \ - memcpy(dst_r + n, temp + 16 * 3, r); \ - memcpy(dst_g + n, temp + 16 * 4, r); \ - memcpy(dst_b + n, temp + 16 * 5, r); \ +#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ + uint8_t* dst_b, int width) { \ + SIMD_ALIGNED(uint8_t vin[16 * 3]); \ + SIMD_ALIGNED(uint8_t vout[16 * 3]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \ + } \ + memcpy(vin, src_ptr + n * BPP, r * BPP); \ + ANY_SIMD(vin, vout, vout + 16, vout + 32, MASK + 1); \ + memcpy(dst_r + n, vout, r); \ + memcpy(dst_g + n, vout + 16, r); \ + memcpy(dst_b + n, vout + 32, r); \ } #ifdef HAS_SPLITRGBROW_SSSE3 @@ -1946,23 +2110,23 @@ ANY13(SplitXRGBRow_Any_NEON, SplitXRGBRow_NEON, 4, 15) #endif // Any 1 to 4. Outputs ARGB planes. 
-#define ANY14(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ - uint8_t* dst_b, uint8_t* dst_a, int width) { \ - SIMD_ALIGNED(uint8_t temp[16 * 8]); \ - memset(temp, 0, 16 * 4); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, dst_a, n); \ - } \ - memcpy(temp, src_ptr + n * BPP, r * BPP); \ - ANY_SIMD(temp, temp + 16 * 4, temp + 16 * 5, temp + 16 * 6, temp + 16 * 7, \ - MASK + 1); \ - memcpy(dst_r + n, temp + 16 * 4, r); \ - memcpy(dst_g + n, temp + 16 * 5, r); \ - memcpy(dst_b + n, temp + 16 * 6, r); \ - memcpy(dst_a + n, temp + 16 * 7, r); \ +#define ANY14(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ + uint8_t* dst_b, uint8_t* dst_a, int width) { \ + SIMD_ALIGNED(uint8_t vin[16 * 4]); \ + SIMD_ALIGNED(uint8_t vout[16 * 4]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, dst_a, n); \ + } \ + memcpy(vin, src_ptr + n * BPP, r * BPP); \ + ANY_SIMD(vin, vout, vout + 16, vout + 32, vout + 48, MASK + 1); \ + memcpy(dst_r + n, vout, r); \ + memcpy(dst_g + n, vout + 16, r); \ + memcpy(dst_b + n, vout + 32, r); \ + memcpy(dst_a + n, vout + 48, r); \ } #ifdef HAS_SPLITARGBROW_SSE2 @@ -1983,25 +2147,26 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15) #define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \ uint8_t* dst_v, int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 4]); \ - memset(temp, 0, 128 * 2); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[128 * 2]); \ + SIMD_ALIGNED(uint8_t vout[128 * 2]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, src_stride, dst_u, dst_v, n); \ } \ - memcpy(temp, src_ptr + (n 
>> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + memcpy(vin + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ SS(r, UVSHIFT) * BPP); \ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ - memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ + memcpy(vin + SS(r, UVSHIFT) * BPP, vin + SS(r, UVSHIFT) * BPP - BPP, \ BPP); \ - memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ - temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ + memcpy(vin + 128 + SS(r, UVSHIFT) * BPP, \ + vin + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ } \ - ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \ - memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \ - memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \ + ANY_SIMD(vin, 128, vout, vout + 128, MASK + 1); \ + memcpy(dst_u + (n >> 1), vout, SS(r, 1)); \ + memcpy(dst_v + (n >> 1), vout + 128, SS(r, 1)); \ } #ifdef HAS_ARGBTOUVROW_AVX2 @@ -2013,9 +2178,17 @@ ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31) #ifdef HAS_ARGBTOUVJROW_AVX2 ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31) #endif +#ifdef HAS_ABGRTOUVJROW_AVX2 +ANY12S(ABGRToUVJRow_Any_AVX2, ABGRToUVJRow_AVX2, 0, 4, 31) +#endif +#ifdef HAS_ARGBTOUVJROW_SSSE3 +ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15) +#endif +#ifdef HAS_ABGRTOUVJROW_SSSE3 +ANY12S(ABGRToUVJRow_Any_SSSE3, ABGRToUVJRow_SSSE3, 0, 4, 15) +#endif #ifdef HAS_ARGBTOUVROW_SSSE3 ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15) -ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15) ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15) ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15) ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15) @@ -2034,12 +2207,18 @@ ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15) #ifdef HAS_ARGBTOUVROW_MSA ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 
0, 4, 31) #endif +#ifdef HAS_ARGBTOUVROW_LSX +ANY12S(ARGBToUVRow_Any_LSX, ARGBToUVRow_LSX, 0, 4, 15) +#endif #ifdef HAS_ARGBTOUVROW_LASX ANY12S(ARGBToUVRow_Any_LASX, ARGBToUVRow_LASX, 0, 4, 31) #endif #ifdef HAS_ARGBTOUVJROW_NEON ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15) #endif +#ifdef HAS_ABGRTOUVJROW_NEON +ANY12S(ABGRToUVJRow_Any_NEON, ABGRToUVJRow_NEON, 0, 4, 15) +#endif #ifdef HAS_ARGBTOUVJROW_MSA ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31) #endif @@ -2142,12 +2321,18 @@ ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15) #ifdef HAS_YUY2TOUVROW_MSA ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31) #endif +#ifdef HAS_YUY2TOUVROW_LSX +ANY12S(YUY2ToUVRow_Any_LSX, YUY2ToUVRow_LSX, 1, 4, 15) +#endif #ifdef HAS_YUY2TOUVROW_LASX ANY12S(YUY2ToUVRow_Any_LASX, YUY2ToUVRow_LASX, 1, 4, 31) #endif #ifdef HAS_UYVYTOUVROW_MSA ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31) #endif +#ifdef HAS_UYVYTOUVROW_LSX +ANY12S(UYVYToUVRow_Any_LSX, UYVYToUVRow_LSX, 1, 4, 15) +#endif #ifdef HAS_UYVYTOUVROW_LASX ANY12S(UYVYToUVRow_Any_LASX, UYVYToUVRow_LASX, 1, 4, 31) #endif @@ -2158,24 +2343,25 @@ ANY12S(UYVYToUVRow_Any_LASX, UYVYToUVRow_LASX, 1, 4, 31) #define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_vu, \ int width) { \ - SIMD_ALIGNED(uint8_t temp[128 * 3]); \ - memset(temp, 0, 128 * 2); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[128 * 2]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, src_stride, dst_vu, n); \ } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ + memcpy(vin, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + memcpy(vin + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ SS(r, UVSHIFT) * BPP); \ if ((width & 1) && UVSHIFT == 0) { 
/* repeat last pixel for subsample */ \ - memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ + memcpy(vin + SS(r, UVSHIFT) * BPP, vin + SS(r, UVSHIFT) * BPP - BPP, \ BPP); \ - memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ - temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ + memcpy(vin + 128 + SS(r, UVSHIFT) * BPP, \ + vin + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ } \ - ANY_SIMD(temp, 128, temp + 256, MASK + 1); \ - memcpy(dst_vu + (n >> 1) * 2, temp + 256, SS(r, 1) * 2); \ + ANY_SIMD(vin, 128, vout, MASK + 1); \ + memcpy(dst_vu + (n >> 1) * 2, vout, SS(r, 1) * 2); \ } #ifdef HAS_AYUVTOVUROW_NEON @@ -2184,42 +2370,53 @@ ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15) #endif #undef ANY11S -#define ANYDETILE(NAMEANY, ANY_SIMD, MASK) \ - void NAMEANY(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, \ - int width) { \ - SIMD_ALIGNED(uint8_t temp[16 * 2]); \ - memset(temp, 0, 16); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src, src_tile_stride, dst, n); \ - } \ - memcpy(temp, src + (n / 16) * src_tile_stride, r); \ - ANY_SIMD(temp, src_tile_stride, temp + 16, MASK + 1); \ - memcpy(dst + n, temp + 16, r); \ +#define ANYDETILE(NAMEANY, ANY_SIMD, T, BPP, MASK) \ + void NAMEANY(const T* src, ptrdiff_t src_tile_stride, T* dst, int width) { \ + SIMD_ALIGNED(T vin[16]); \ + SIMD_ALIGNED(T vout[16]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src, src_tile_stride, dst, n); \ + } \ + memcpy(vin, src + (n / 16) * src_tile_stride, r * BPP); \ + ANY_SIMD(vin, src_tile_stride, vout, MASK + 1); \ + memcpy(dst + n, vout, r * BPP); \ } #ifdef HAS_DETILEROW_NEON -ANYDETILE(DetileRow_Any_NEON, DetileRow_NEON, 15) +ANYDETILE(DetileRow_Any_NEON, DetileRow_NEON, uint8_t, 1, 15) #endif #ifdef HAS_DETILEROW_SSE2 -ANYDETILE(DetileRow_Any_SSE2, DetileRow_SSE2, 15) +ANYDETILE(DetileRow_Any_SSE2, DetileRow_SSE2, 
uint8_t, 1, 15) +#endif +#ifdef HAS_DETILEROW_16_NEON +ANYDETILE(DetileRow_16_Any_NEON, DetileRow_16_NEON, uint16_t, 2, 15) +#endif +#ifdef HAS_DETILEROW_16_SSE2 +ANYDETILE(DetileRow_16_Any_SSE2, DetileRow_16_SSE2, uint16_t, 2, 15) +#endif +#ifdef HAS_DETILEROW_16_AVX +ANYDETILE(DetileRow_16_Any_AVX, DetileRow_16_AVX, uint16_t, 2, 15) #endif +// DetileSplitUVRow width is in bytes #define ANYDETILESPLITUV(NAMEANY, ANY_SIMD, MASK) \ void NAMEANY(const uint8_t* src_uv, ptrdiff_t src_tile_stride, \ uint8_t* dst_u, uint8_t* dst_v, int width) { \ - SIMD_ALIGNED(uint8_t temp[16 * 2]); \ - memset(temp, 0, 16 * 2); /* for msan */ \ + SIMD_ALIGNED(uint8_t vin[16]); \ + SIMD_ALIGNED(uint8_t vout[8 * 2]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_uv, src_tile_stride, dst_u, dst_v, n); \ } \ - memcpy(temp, src_uv + (n / 16) * src_tile_stride, r); \ - ANY_SIMD(temp, src_tile_stride, temp + 16, temp + 24, r); \ - memcpy(dst_u + n / 2, temp + 16, (r + 1) / 2); \ - memcpy(dst_v + n / 2, temp + 24, (r + 1) / 2); \ + memcpy(vin, src_uv + (n / 16) * src_tile_stride, r); \ + ANY_SIMD(vin, src_tile_stride, vout, vout + 8, r); \ + memcpy(dst_u + n / 2, vout, (r + 1) / 2); \ + memcpy(dst_v + n / 2, vout + 8, (r + 1) / 2); \ } #ifdef HAS_DETILESPLITUVROW_NEON @@ -2229,6 +2426,33 @@ ANYDETILESPLITUV(DetileSplitUVRow_Any_NEON, DetileSplitUVRow_NEON, 15) ANYDETILESPLITUV(DetileSplitUVRow_Any_SSSE3, DetileSplitUVRow_SSSE3, 15) #endif +#define ANYDETILEMERGE(NAMEANY, ANY_SIMD, MASK) \ + void NAMEANY(const uint8_t* src_y, ptrdiff_t src_y_tile_stride, \ + const uint8_t* src_uv, ptrdiff_t src_uv_tile_stride, \ + uint8_t* dst_yuy2, int width) { \ + SIMD_ALIGNED(uint8_t vin[16 * 2]); \ + SIMD_ALIGNED(uint8_t vout[16 * 2]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_y, src_y_tile_stride, src_uv, src_uv_tile_stride, dst_yuy2, \ 
+ n); \ + } \ + memcpy(vin, src_y + (n / 16) * src_y_tile_stride, r); \ + memcpy(vin + 16, src_uv + (n / 16) * src_uv_tile_stride, r); \ + ANY_SIMD(vin, src_y_tile_stride, vin + 16, src_uv_tile_stride, vout, r); \ + memcpy(dst_yuy2 + 2 * n, vout, 2 * r); \ + } + +#ifdef HAS_DETILETOYUY2_NEON +ANYDETILEMERGE(DetileToYUY2_Any_NEON, DetileToYUY2_NEON, 15) +#endif + +#ifdef HAS_DETILETOYUY2_SSE2 +ANYDETILEMERGE(DetileToYUY2_Any_SSE2, DetileToYUY2_SSE2, 15) +#endif + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/row_common.cc b/files/source/row_common.cc index 83442496..8be37fb5 100644 --- a/files/source/row_common.cc +++ b/files/source/row_common.cc @@ -21,6 +21,12 @@ namespace libyuv { extern "C" { #endif +#ifdef __cplusplus +#define STATIC_CAST(type, expr) static_cast(expr) +#else +#define STATIC_CAST(type, expr) (type)(expr) +#endif + // This macro controls YUV to RGB using unsigned math to extend range of // YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for more accuracy on B: // LIBYUV_UNLIMITED_DATA @@ -182,12 +188,13 @@ void RGB565ToARGBRow_C(const uint8_t* src_rgb565, int width) { int x; for (x = 0; x < width; ++x) { - uint8_t b = src_rgb565[0] & 0x1f; - uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8_t r = src_rgb565[1] >> 3; - dst_argb[0] = (b << 3) | (b >> 2); - dst_argb[1] = (g << 2) | (g >> 4); - dst_argb[2] = (r << 3) | (r >> 2); + uint8_t b = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f); + uint8_t g = STATIC_CAST( + uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); + uint8_t r = STATIC_CAST(uint8_t, src_rgb565[1] >> 3); + dst_argb[0] = STATIC_CAST(uint8_t, (b << 3) | (b >> 2)); + dst_argb[1] = STATIC_CAST(uint8_t, (g << 2) | (g >> 4)); + dst_argb[2] = STATIC_CAST(uint8_t, (r << 3) | (r >> 2)); dst_argb[3] = 255u; dst_argb += 4; src_rgb565 += 2; @@ -199,13 +206,14 @@ void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555, int width) { int x; for (x = 0; x < width; ++x) { - 
uint8_t b = src_argb1555[0] & 0x1f; - uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8_t r = (src_argb1555[1] & 0x7c) >> 2; - uint8_t a = src_argb1555[1] >> 7; - dst_argb[0] = (b << 3) | (b >> 2); - dst_argb[1] = (g << 3) | (g >> 2); - dst_argb[2] = (r << 3) | (r >> 2); + uint8_t b = STATIC_CAST(uint8_t, src_argb1555[0] & 0x1f); + uint8_t g = STATIC_CAST( + uint8_t, (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3)); + uint8_t r = STATIC_CAST(uint8_t, (src_argb1555[1] & 0x7c) >> 2); + uint8_t a = STATIC_CAST(uint8_t, src_argb1555[1] >> 7); + dst_argb[0] = STATIC_CAST(uint8_t, (b << 3) | (b >> 2)); + dst_argb[1] = STATIC_CAST(uint8_t, (g << 3) | (g >> 2)); + dst_argb[2] = STATIC_CAST(uint8_t, (r << 3) | (r >> 2)); dst_argb[3] = -a; dst_argb += 4; src_argb1555 += 2; @@ -217,14 +225,14 @@ void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444, int width) { int x; for (x = 0; x < width; ++x) { - uint8_t b = src_argb4444[0] & 0x0f; - uint8_t g = src_argb4444[0] >> 4; - uint8_t r = src_argb4444[1] & 0x0f; - uint8_t a = src_argb4444[1] >> 4; - dst_argb[0] = (b << 4) | b; - dst_argb[1] = (g << 4) | g; - dst_argb[2] = (r << 4) | r; - dst_argb[3] = (a << 4) | a; + uint8_t b = STATIC_CAST(uint8_t, src_argb4444[0] & 0x0f); + uint8_t g = STATIC_CAST(uint8_t, src_argb4444[0] >> 4); + uint8_t r = STATIC_CAST(uint8_t, src_argb4444[1] & 0x0f); + uint8_t a = STATIC_CAST(uint8_t, src_argb4444[1] >> 4); + dst_argb[0] = STATIC_CAST(uint8_t, (b << 4) | b); + dst_argb[1] = STATIC_CAST(uint8_t, (g << 4) | g); + dst_argb[2] = STATIC_CAST(uint8_t, (r << 4) | r); + dst_argb[3] = STATIC_CAST(uint8_t, (a << 4) | a); dst_argb += 4; src_argb4444 += 2; } @@ -320,7 +328,7 @@ void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { uint8_t b0 = src_argb[0] >> 3; uint8_t g0 = src_argb[1] >> 2; uint8_t r0 = src_argb[2] >> 3; - *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + *(uint16_t*)(dst_rgb) = STATIC_CAST(uint16_t, b0 | (g0 << 5) | 
(r0 << 11)); } } @@ -334,29 +342,31 @@ void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { // or the upper byte for big endian. void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width) { int x; for (x = 0; x < width - 1; x += 2) { int dither0 = ((const unsigned char*)(&dither4))[x & 3]; int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3]; - uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3; - uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2; - uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3; - uint8_t b1 = clamp255(src_argb[4] + dither1) >> 3; - uint8_t g1 = clamp255(src_argb[5] + dither1) >> 2; - uint8_t r1 = clamp255(src_argb[6] + dither1) >> 3; - *(uint16_t*)(dst_rgb + 0) = b0 | (g0 << 5) | (r0 << 11); - *(uint16_t*)(dst_rgb + 2) = b1 | (g1 << 5) | (r1 << 11); + uint8_t b0 = STATIC_CAST(uint8_t, clamp255(src_argb[0] + dither0) >> 3); + uint8_t g0 = STATIC_CAST(uint8_t, clamp255(src_argb[1] + dither0) >> 2); + uint8_t r0 = STATIC_CAST(uint8_t, clamp255(src_argb[2] + dither0) >> 3); + uint8_t b1 = STATIC_CAST(uint8_t, clamp255(src_argb[4] + dither1) >> 3); + uint8_t g1 = STATIC_CAST(uint8_t, clamp255(src_argb[5] + dither1) >> 2); + uint8_t r1 = STATIC_CAST(uint8_t, clamp255(src_argb[6] + dither1) >> 3); + *(uint16_t*)(dst_rgb + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11)); + *(uint16_t*)(dst_rgb + 2) = + STATIC_CAST(uint16_t, b1 | (g1 << 5) | (r1 << 11)); dst_rgb += 4; src_argb += 8; } if (width & 1) { int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3]; - uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3; - uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2; - uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3; - *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + uint8_t b0 = STATIC_CAST(uint8_t, clamp255(src_argb[0] + dither0) >> 3); + uint8_t g0 = STATIC_CAST(uint8_t, clamp255(src_argb[1] + dither0) >> 2); + uint8_t r0 = 
STATIC_CAST(uint8_t, clamp255(src_argb[2] + dither0) >> 3); + *(uint16_t*)(dst_rgb) = STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11)); } } @@ -371,8 +381,10 @@ void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { uint8_t g1 = src_argb[5] >> 3; uint8_t r1 = src_argb[6] >> 3; uint8_t a1 = src_argb[7] >> 7; - *(uint16_t*)(dst_rgb + 0) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); - *(uint16_t*)(dst_rgb + 2) = b1 | (g1 << 5) | (r1 << 10) | (a1 << 15); + *(uint16_t*)(dst_rgb + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 10) | (a0 << 15)); + *(uint16_t*)(dst_rgb + 2) = + STATIC_CAST(uint16_t, b1 | (g1 << 5) | (r1 << 10) | (a1 << 15)); dst_rgb += 4; src_argb += 8; } @@ -381,7 +393,8 @@ void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { uint8_t g0 = src_argb[1] >> 3; uint8_t r0 = src_argb[2] >> 3; uint8_t a0 = src_argb[3] >> 7; - *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); + *(uint16_t*)(dst_rgb) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 10) | (a0 << 15)); } } @@ -396,8 +409,10 @@ void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { uint8_t g1 = src_argb[5] >> 4; uint8_t r1 = src_argb[6] >> 4; uint8_t a1 = src_argb[7] >> 4; - *(uint16_t*)(dst_rgb + 0) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); - *(uint16_t*)(dst_rgb + 2) = b1 | (g1 << 4) | (r1 << 8) | (a1 << 12); + *(uint16_t*)(dst_rgb + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 4) | (r0 << 8) | (a0 << 12)); + *(uint16_t*)(dst_rgb + 2) = + STATIC_CAST(uint16_t, b1 | (g1 << 4) | (r1 << 8) | (a1 << 12)); dst_rgb += 4; src_argb += 8; } @@ -406,18 +421,20 @@ void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { uint8_t g0 = src_argb[1] >> 4; uint8_t r0 = src_argb[2] >> 4; uint8_t a0 = src_argb[3] >> 4; - *(uint16_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); + *(uint16_t*)(dst_rgb) = + STATIC_CAST(uint16_t, b0 | (g0 << 4) | (r0 << 8) | (a0 << 12)); } } void 
ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) { int x; for (x = 0; x < width; ++x) { - uint32_t b0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2); + uint32_t r0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2); uint32_t g0 = (src_abgr[1] >> 6) | ((uint32_t)(src_abgr[1]) << 2); - uint32_t r0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2); + uint32_t b0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2); uint32_t a0 = (src_abgr[3] >> 6); - *(uint32_t*)(dst_ar30) = r0 | (g0 << 10) | (b0 << 20) | (a0 << 30); + *(uint32_t*)(dst_ar30) = + STATIC_CAST(uint32_t, b0 | (g0 << 10) | (r0 << 20) | (a0 << 30)); dst_ar30 += 4; src_abgr += 4; } @@ -430,7 +447,8 @@ void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { uint32_t g0 = (src_argb[1] >> 6) | ((uint32_t)(src_argb[1]) << 2); uint32_t r0 = (src_argb[2] >> 6) | ((uint32_t)(src_argb[2]) << 2); uint32_t a0 = (src_argb[3] >> 6); - *(uint32_t*)(dst_ar30) = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30); + *(uint32_t*)(dst_ar30) = + STATIC_CAST(uint32_t, b0 | (g0 << 10) | (r0 << 20) | (a0 << 30)); dst_ar30 += 4; src_argb += 4; } @@ -439,10 +457,14 @@ void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { int x; for (x = 0; x < width; ++x) { - dst_ar64[0] = src_argb[0] * 0x0101; - dst_ar64[1] = src_argb[1] * 0x0101; - dst_ar64[2] = src_argb[2] * 0x0101; - dst_ar64[3] = src_argb[3] * 0x0101; + uint16_t b = src_argb[0] * 0x0101; + uint16_t g = src_argb[1] * 0x0101; + uint16_t r = src_argb[2] * 0x0101; + uint16_t a = src_argb[3] * 0x0101; + dst_ar64[0] = b; + dst_ar64[1] = g; + dst_ar64[2] = r; + dst_ar64[3] = a; dst_ar64 += 4; src_argb += 4; } @@ -451,10 +473,14 @@ void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { int x; for (x = 0; x < width; ++x) { - 
dst_ab64[0] = src_argb[2] * 0x0101; - dst_ab64[1] = src_argb[1] * 0x0101; - dst_ab64[2] = src_argb[0] * 0x0101; - dst_ab64[3] = src_argb[3] * 0x0101; + uint16_t b = src_argb[0] * 0x0101; + uint16_t g = src_argb[1] * 0x0101; + uint16_t r = src_argb[2] * 0x0101; + uint16_t a = src_argb[3] * 0x0101; + dst_ab64[0] = r; + dst_ab64[1] = g; + dst_ab64[2] = b; + dst_ab64[3] = a; dst_ab64 += 4; src_argb += 4; } @@ -463,10 +489,14 @@ void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - dst_argb[0] = src_ar64[0] >> 8; - dst_argb[1] = src_ar64[1] >> 8; - dst_argb[2] = src_ar64[2] >> 8; - dst_argb[3] = src_ar64[3] >> 8; + uint8_t b = src_ar64[0] >> 8; + uint8_t g = src_ar64[1] >> 8; + uint8_t r = src_ar64[2] >> 8; + uint8_t a = src_ar64[3] >> 8; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; dst_argb += 4; src_ar64 += 4; } @@ -475,10 +505,14 @@ void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - dst_argb[0] = src_ab64[2] >> 8; - dst_argb[1] = src_ab64[1] >> 8; - dst_argb[2] = src_ab64[0] >> 8; - dst_argb[3] = src_ab64[3] >> 8; + uint8_t r = src_ab64[0] >> 8; + uint8_t g = src_ab64[1] >> 8; + uint8_t b = src_ab64[2] >> 8; + uint8_t a = src_ab64[3] >> 8; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; dst_argb += 4; src_ab64 += 4; } @@ -514,8 +548,8 @@ void AR64ShuffleRow_C(const uint8_t* src_ar64, #ifdef LIBYUV_RGB7 // Old 7 bit math for compatibility on unsupported platforms. 
-static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { - return ((33 * r + 65 * g + 13 * b) >> 7) + 16; +static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, ((33 * r + 65 * g + 13 * b) >> 7) + 16); } #else // 8 bit @@ -524,8 +558,8 @@ static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { // return (66 * ((int)r - 128) + 129 * ((int)g - 128) + 25 * ((int)b - 128) + // 0x7e80) >> 8; -static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { - return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; +static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (66 * r + 129 * g + 25 * b + 0x1080) >> 8); } #endif @@ -533,29 +567,31 @@ static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { // LIBYUV_RGBTOU_TRUNCATE mimics x86 code that does not round. #ifdef LIBYUV_RGBTOU_TRUNCATE -static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { - return (112 * b - 74 * g - 38 * r + 0x8000) >> 8; +static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8000) >> 8); } -static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { - return (112 * r - 94 * g - 18 * b + 0x8000) >> 8; +static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8000) >> 8); } #else // TODO(fbarchard): Add rounding to x86 SIMD and use this -static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { - return (112 * b - 74 * g - 38 * r + 0x8080) >> 8; +static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8080) >> 8); } -static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { - return (112 * r - 94 * g - 18 * b + 0x8080) >> 8; +static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8080) >> 8); } #endif // 
LIBYUV_ARGBTOUV_PAVGB mimics x86 code that subsamples with 2 pavgb. #if !defined(LIBYUV_ARGBTOUV_PAVGB) static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) { - return ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8; + return STATIC_CAST( + uint8_t, ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8); } static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) { - return ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8; + return STATIC_CAST( + uint8_t, ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8); } #endif @@ -674,28 +710,28 @@ MAKEROWY(RAW, 0, 1, 2, 3) #ifdef LIBYUV_RGB7 // Old 7 bit math for compatibility on unsupported platforms. -static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { +static __inline uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { return (38 * r + 75 * g + 15 * b + 64) >> 7; } #else // 8 bit -static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { +static __inline uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { return (77 * r + 150 * g + 29 * b + 128) >> 8; } #endif #if defined(LIBYUV_ARGBTOUV_PAVGB) -static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) { +static __inline uint8_t RGBToUJ(uint8_t r, uint8_t g, uint8_t b) { return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; } -static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { +static __inline uint8_t RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; } #else -static __inline int RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) { +static __inline uint8_t RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) { return ((127 / 2) * b - (84 / 2) * g - (43 / 2) * r + 0x8080) >> 8; } -static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { +static __inline uint8_t RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { return ((127 / 2) * r - (107 / 2) * g - (20 / 2) * b + 0x8080) >> 8; } #endif @@ -782,6 +818,7 @@ static __inline int RGB2xToVJ(uint16_t r, 
uint16_t g, uint16_t b) { #endif MAKEROWYJ(ARGB, 2, 1, 0, 4) +MAKEROWYJ(ABGR, 0, 1, 2, 4) MAKEROWYJ(RGBA, 3, 2, 1, 4) MAKEROWYJ(RGB24, 2, 1, 0, 3) MAKEROWYJ(RAW, 0, 1, 2, 3) @@ -791,11 +828,12 @@ void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { int x; for (x = 0; x < width; ++x) { uint8_t b = src_rgb565[0] & 0x1f; - uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t g = STATIC_CAST( + uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); uint8_t r = src_rgb565[1] >> 3; - b = (b << 3) | (b >> 2); - g = (g << 2) | (g >> 4); - r = (r << 3) | (r >> 2); + b = STATIC_CAST(uint8_t, (b << 3) | (b >> 2)); + g = STATIC_CAST(uint8_t, (g << 2) | (g >> 4)); + r = STATIC_CAST(uint8_t, (r << 3) | (r >> 2)); dst_y[0] = RGBToY(r, g, b); src_rgb565 += 2; dst_y += 1; @@ -806,11 +844,12 @@ void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { int x; for (x = 0; x < width; ++x) { uint8_t b = src_argb1555[0] & 0x1f; - uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t g = STATIC_CAST( + uint8_t, (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3)); uint8_t r = (src_argb1555[1] & 0x7c) >> 2; - b = (b << 3) | (b >> 2); - g = (g << 3) | (g >> 2); - r = (r << 3) | (r >> 2); + b = STATIC_CAST(uint8_t, (b << 3) | (b >> 2)); + g = STATIC_CAST(uint8_t, (g << 3) | (g >> 2)); + r = STATIC_CAST(uint8_t, (r << 3) | (r >> 2)); dst_y[0] = RGBToY(r, g, b); src_argb1555 += 2; dst_y += 1; @@ -823,9 +862,9 @@ void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { uint8_t b = src_argb4444[0] & 0x0f; uint8_t g = src_argb4444[0] >> 4; uint8_t r = src_argb4444[1] & 0x0f; - b = (b << 4) | b; - g = (g << 4) | g; - r = (r << 4) | r; + b = STATIC_CAST(uint8_t, (b << 4) | b); + g = STATIC_CAST(uint8_t, (g << 4) | g); + r = STATIC_CAST(uint8_t, (r << 4) | r); dst_y[0] = RGBToY(r, g, b); src_argb4444 += 2; dst_y += 1; @@ -840,31 +879,35 @@ void RGB565ToUVRow_C(const 
uint8_t* src_rgb565, const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565; int x; for (x = 0; x < width - 1; x += 2) { - uint8_t b0 = src_rgb565[0] & 0x1f; - uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8_t r0 = src_rgb565[1] >> 3; - uint8_t b1 = src_rgb565[2] & 0x1f; - uint8_t g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3); - uint8_t r1 = src_rgb565[3] >> 3; - uint8_t b2 = next_rgb565[0] & 0x1f; - uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); - uint8_t r2 = next_rgb565[1] >> 3; - uint8_t b3 = next_rgb565[2] & 0x1f; - uint8_t g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3); - uint8_t r3 = next_rgb565[3] >> 3; - - b0 = (b0 << 3) | (b0 >> 2); - g0 = (g0 << 2) | (g0 >> 4); - r0 = (r0 << 3) | (r0 >> 2); - b1 = (b1 << 3) | (b1 >> 2); - g1 = (g1 << 2) | (g1 >> 4); - r1 = (r1 << 3) | (r1 >> 2); - b2 = (b2 << 3) | (b2 >> 2); - g2 = (g2 << 2) | (g2 >> 4); - r2 = (r2 << 3) | (r2 >> 2); - b3 = (b3 << 3) | (b3 >> 2); - g3 = (g3 << 2) | (g3 >> 4); - r3 = (r3 << 3) | (r3 >> 2); + uint8_t b0 = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f); + uint8_t g0 = STATIC_CAST( + uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); + uint8_t r0 = STATIC_CAST(uint8_t, src_rgb565[1] >> 3); + uint8_t b1 = STATIC_CAST(uint8_t, src_rgb565[2] & 0x1f); + uint8_t g1 = STATIC_CAST( + uint8_t, (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3)); + uint8_t r1 = STATIC_CAST(uint8_t, src_rgb565[3] >> 3); + uint8_t b2 = STATIC_CAST(uint8_t, next_rgb565[0] & 0x1f); + uint8_t g2 = STATIC_CAST( + uint8_t, (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3)); + uint8_t r2 = STATIC_CAST(uint8_t, next_rgb565[1] >> 3); + uint8_t b3 = STATIC_CAST(uint8_t, next_rgb565[2] & 0x1f); + uint8_t g3 = STATIC_CAST( + uint8_t, (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3)); + uint8_t r3 = STATIC_CAST(uint8_t, next_rgb565[3] >> 3); + + b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2)); + g0 = STATIC_CAST(uint8_t, (g0 << 2) 
| (g0 >> 4)); + r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2)); + b1 = STATIC_CAST(uint8_t, (b1 << 3) | (b1 >> 2)); + g1 = STATIC_CAST(uint8_t, (g1 << 2) | (g1 >> 4)); + r1 = STATIC_CAST(uint8_t, (r1 << 3) | (r1 >> 2)); + b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2)); + g2 = STATIC_CAST(uint8_t, (g2 << 2) | (g2 >> 4)); + r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); + b3 = STATIC_CAST(uint8_t, (b3 << 3) | (b3 >> 2)); + g3 = STATIC_CAST(uint8_t, (g3 << 2) | (g3 >> 4)); + r3 = STATIC_CAST(uint8_t, (r3 << 3) | (r3 >> 2)); #if LIBYUV_ARGBTOUV_PAVGB uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); @@ -886,19 +929,20 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565, dst_v += 1; } if (width & 1) { - uint8_t b0 = src_rgb565[0] & 0x1f; - uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8_t r0 = src_rgb565[1] >> 3; - uint8_t b2 = next_rgb565[0] & 0x1f; - uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); - uint8_t r2 = next_rgb565[1] >> 3; - - b0 = (b0 << 3) | (b0 >> 2); - g0 = (g0 << 2) | (g0 >> 4); - r0 = (r0 << 3) | (r0 >> 2); - b2 = (b2 << 3) | (b2 >> 2); - g2 = (g2 << 2) | (g2 >> 4); - r2 = (r2 << 3) | (r2 >> 2); + uint8_t b0 = STATIC_CAST(uint8_t, src_rgb565[0] & 0x1f); + uint8_t g0 = STATIC_CAST( + uint8_t, (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3)); + uint8_t r0 = STATIC_CAST(uint8_t, src_rgb565[1] >> 3); + uint8_t b2 = STATIC_CAST(uint8_t, next_rgb565[0] & 0x1f); + uint8_t g2 = STATIC_CAST( + uint8_t, (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3)); + uint8_t r2 = STATIC_CAST(uint8_t, next_rgb565[1] >> 3); + b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2)); + g0 = STATIC_CAST(uint8_t, (g0 << 2) | (g0 >> 4)); + r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2)); + b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2)); + g2 = STATIC_CAST(uint8_t, (g2 << 2) | (g2 >> 4)); + r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); #if LIBYUV_ARGBTOUV_PAVGB uint8_t ab = AVGB(b0, b2); @@ -924,31 +968,35 @@ void 
ARGB1555ToUVRow_C(const uint8_t* src_argb1555, const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555; int x; for (x = 0; x < width - 1; x += 2) { - uint8_t b0 = src_argb1555[0] & 0x1f; - uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; - uint8_t b1 = src_argb1555[2] & 0x1f; - uint8_t g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3); - uint8_t r1 = (src_argb1555[3] & 0x7c) >> 2; - uint8_t b2 = next_argb1555[0] & 0x1f; - uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); - uint8_t r2 = (next_argb1555[1] & 0x7c) >> 2; - uint8_t b3 = next_argb1555[2] & 0x1f; - uint8_t g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3); - uint8_t r3 = (next_argb1555[3] & 0x7c) >> 2; - - b0 = (b0 << 3) | (b0 >> 2); - g0 = (g0 << 3) | (g0 >> 2); - r0 = (r0 << 3) | (r0 >> 2); - b1 = (b1 << 3) | (b1 >> 2); - g1 = (g1 << 3) | (g1 >> 2); - r1 = (r1 << 3) | (r1 >> 2); - b2 = (b2 << 3) | (b2 >> 2); - g2 = (g2 << 3) | (g2 >> 2); - r2 = (r2 << 3) | (r2 >> 2); - b3 = (b3 << 3) | (b3 >> 2); - g3 = (g3 << 3) | (g3 >> 2); - r3 = (r3 << 3) | (r3 >> 2); + uint8_t b0 = STATIC_CAST(uint8_t, src_argb1555[0] & 0x1f); + uint8_t g0 = STATIC_CAST( + uint8_t, (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3)); + uint8_t r0 = STATIC_CAST(uint8_t, (src_argb1555[1] & 0x7c) >> 2); + uint8_t b1 = STATIC_CAST(uint8_t, src_argb1555[2] & 0x1f); + uint8_t g1 = STATIC_CAST( + uint8_t, (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3)); + uint8_t r1 = STATIC_CAST(uint8_t, (src_argb1555[3] & 0x7c) >> 2); + uint8_t b2 = STATIC_CAST(uint8_t, next_argb1555[0] & 0x1f); + uint8_t g2 = STATIC_CAST( + uint8_t, (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3)); + uint8_t r2 = STATIC_CAST(uint8_t, (next_argb1555[1] & 0x7c) >> 2); + uint8_t b3 = STATIC_CAST(uint8_t, next_argb1555[2] & 0x1f); + uint8_t g3 = STATIC_CAST( + uint8_t, (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) 
<< 3)); + uint8_t r3 = STATIC_CAST(uint8_t, (next_argb1555[3] & 0x7c) >> 2); + + b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2)); + g0 = STATIC_CAST(uint8_t, (g0 << 3) | (g0 >> 2)); + r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2)); + b1 = STATIC_CAST(uint8_t, (b1 << 3) | (b1 >> 2)); + g1 = STATIC_CAST(uint8_t, (g1 << 3) | (g1 >> 2)); + r1 = STATIC_CAST(uint8_t, (r1 << 3) | (r1 >> 2)); + b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2)); + g2 = STATIC_CAST(uint8_t, (g2 << 3) | (g2 >> 2)); + r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); + b3 = STATIC_CAST(uint8_t, (b3 << 3) | (b3 >> 2)); + g3 = STATIC_CAST(uint8_t, (g3 << 3) | (g3 >> 2)); + r3 = STATIC_CAST(uint8_t, (r3 << 3) | (r3 >> 2)); #if LIBYUV_ARGBTOUV_PAVGB uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); @@ -970,19 +1018,21 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, dst_v += 1; } if (width & 1) { - uint8_t b0 = src_argb1555[0] & 0x1f; - uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; - uint8_t b2 = next_argb1555[0] & 0x1f; - uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); - uint8_t r2 = (next_argb1555[1] & 0x7c) >> 2; - - b0 = (b0 << 3) | (b0 >> 2); - g0 = (g0 << 3) | (g0 >> 2); - r0 = (r0 << 3) | (r0 >> 2); - b2 = (b2 << 3) | (b2 >> 2); - g2 = (g2 << 3) | (g2 >> 2); - r2 = (r2 << 3) | (r2 >> 2); + uint8_t b0 = STATIC_CAST(uint8_t, src_argb1555[0] & 0x1f); + uint8_t g0 = STATIC_CAST( + uint8_t, (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3)); + uint8_t r0 = STATIC_CAST(uint8_t, (src_argb1555[1] & 0x7c) >> 2); + uint8_t b2 = STATIC_CAST(uint8_t, next_argb1555[0] & 0x1f); + uint8_t g2 = STATIC_CAST( + uint8_t, (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3)); + uint8_t r2 = STATIC_CAST(uint8_t, (next_argb1555[1] & 0x7c) >> 2); + + b0 = STATIC_CAST(uint8_t, (b0 << 3) | (b0 >> 2)); + g0 = STATIC_CAST(uint8_t, (g0 << 3) | (g0 >> 2)); + r0 = STATIC_CAST(uint8_t, (r0 << 3) | (r0 >> 2)); 
+ b2 = STATIC_CAST(uint8_t, (b2 << 3) | (b2 >> 2)); + g2 = STATIC_CAST(uint8_t, (g2 << 3) | (g2 >> 2)); + r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); #if LIBYUV_ARGBTOUV_PAVGB uint8_t ab = AVGB(b0, b2); @@ -1021,18 +1071,18 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, uint8_t g3 = next_argb4444[2] >> 4; uint8_t r3 = next_argb4444[3] & 0x0f; - b0 = (b0 << 4) | b0; - g0 = (g0 << 4) | g0; - r0 = (r0 << 4) | r0; - b1 = (b1 << 4) | b1; - g1 = (g1 << 4) | g1; - r1 = (r1 << 4) | r1; - b2 = (b2 << 4) | b2; - g2 = (g2 << 4) | g2; - r2 = (r2 << 4) | r2; - b3 = (b3 << 4) | b3; - g3 = (g3 << 4) | g3; - r3 = (r3 << 4) | r3; + b0 = STATIC_CAST(uint8_t, (b0 << 4) | b0); + g0 = STATIC_CAST(uint8_t, (g0 << 4) | g0); + r0 = STATIC_CAST(uint8_t, (r0 << 4) | r0); + b1 = STATIC_CAST(uint8_t, (b1 << 4) | b1); + g1 = STATIC_CAST(uint8_t, (g1 << 4) | g1); + r1 = STATIC_CAST(uint8_t, (r1 << 4) | r1); + b2 = STATIC_CAST(uint8_t, (b2 << 4) | b2); + g2 = STATIC_CAST(uint8_t, (g2 << 4) | g2); + r2 = STATIC_CAST(uint8_t, (r2 << 4) | r2); + b3 = STATIC_CAST(uint8_t, (b3 << 4) | b3); + g3 = STATIC_CAST(uint8_t, (g3 << 4) | g3); + r3 = STATIC_CAST(uint8_t, (r3 << 4) | r3); #if LIBYUV_ARGBTOUV_PAVGB uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); @@ -1061,12 +1111,12 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, uint8_t g2 = next_argb4444[0] >> 4; uint8_t r2 = next_argb4444[1] & 0x0f; - b0 = (b0 << 4) | b0; - g0 = (g0 << 4) | g0; - r0 = (r0 << 4) | r0; - b2 = (b2 << 4) | b2; - g2 = (g2 << 4) | g2; - r2 = (r2 << 4) | r2; + b0 = STATIC_CAST(uint8_t, (b0 << 4) | b0); + g0 = STATIC_CAST(uint8_t, (g0 << 4) | g0); + r0 = STATIC_CAST(uint8_t, (r0 << 4) | r0); + b2 = STATIC_CAST(uint8_t, (b2 << 4) | b2); + g2 = STATIC_CAST(uint8_t, (g2 << 4) | g2); + r2 = STATIC_CAST(uint8_t, (r2 << 4) | r2); #if LIBYUV_ARGBTOUV_PAVGB uint8_t ab = AVGB(b0, b2); @@ -1123,9 +1173,9 @@ void ARGBSepiaRow_C(uint8_t* dst_argb, int width) { int sg = (b * 22 + g * 88 + r * 45) >> 7; int sr = (b * 24 + g * 
98 + r * 50) >> 7; // b does not over flow. a is preserved from original. - dst_argb[0] = sb; - dst_argb[1] = clamp255(sg); - dst_argb[2] = clamp255(sr); + dst_argb[0] = STATIC_CAST(uint8_t, sb); + dst_argb[1] = STATIC_CAST(uint8_t, clamp255(sg)); + dst_argb[2] = STATIC_CAST(uint8_t, clamp255(sr)); dst_argb += 4; } } @@ -1154,10 +1204,10 @@ void ARGBColorMatrixRow_C(const uint8_t* src_argb, int sa = (b * matrix_argb[12] + g * matrix_argb[13] + r * matrix_argb[14] + a * matrix_argb[15]) >> 6; - dst_argb[0] = Clamp(sb); - dst_argb[1] = Clamp(sg); - dst_argb[2] = Clamp(sr); - dst_argb[3] = Clamp(sa); + dst_argb[0] = STATIC_CAST(uint8_t, Clamp(sb)); + dst_argb[1] = STATIC_CAST(uint8_t, Clamp(sg)); + dst_argb[2] = STATIC_CAST(uint8_t, Clamp(sr)); + dst_argb[3] = STATIC_CAST(uint8_t, Clamp(sa)); src_argb += 4; dst_argb += 4; } @@ -1207,9 +1257,12 @@ void ARGBQuantizeRow_C(uint8_t* dst_argb, int b = dst_argb[0]; int g = dst_argb[1]; int r = dst_argb[2]; - dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset; - dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset; - dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset; + dst_argb[0] = STATIC_CAST( + uint8_t, (b * scale >> 16) * interval_size + interval_offset); + dst_argb[1] = STATIC_CAST( + uint8_t, (g * scale >> 16) * interval_size + interval_offset); + dst_argb[2] = STATIC_CAST( + uint8_t, (r * scale >> 16) * interval_size + interval_offset); dst_argb += 4; } } @@ -1260,10 +1313,10 @@ void ARGBMultiplyRow_C(const uint8_t* src_argb, const uint32_t g_scale = src_argb1[1]; const uint32_t r_scale = src_argb1[2]; const uint32_t a_scale = src_argb1[3]; - dst_argb[0] = SHADE(b, b_scale); - dst_argb[1] = SHADE(g, g_scale); - dst_argb[2] = SHADE(r, r_scale); - dst_argb[3] = SHADE(a, a_scale); + dst_argb[0] = STATIC_CAST(uint8_t, SHADE(b, b_scale)); + dst_argb[1] = STATIC_CAST(uint8_t, SHADE(g, g_scale)); + dst_argb[2] = STATIC_CAST(uint8_t, SHADE(r, r_scale)); + dst_argb[3] = 
STATIC_CAST(uint8_t, SHADE(a, a_scale)); src_argb += 4; src_argb1 += 4; dst_argb += 4; @@ -1288,10 +1341,10 @@ void ARGBAddRow_C(const uint8_t* src_argb, const int g_add = src_argb1[1]; const int r_add = src_argb1[2]; const int a_add = src_argb1[3]; - dst_argb[0] = SHADE(b, b_add); - dst_argb[1] = SHADE(g, g_add); - dst_argb[2] = SHADE(r, r_add); - dst_argb[3] = SHADE(a, a_add); + dst_argb[0] = STATIC_CAST(uint8_t, SHADE(b, b_add)); + dst_argb[1] = STATIC_CAST(uint8_t, SHADE(g, g_add)); + dst_argb[2] = STATIC_CAST(uint8_t, SHADE(r, r_add)); + dst_argb[3] = STATIC_CAST(uint8_t, SHADE(a, a_add)); src_argb += 4; src_argb1 += 4; dst_argb += 4; @@ -1315,10 +1368,10 @@ void ARGBSubtractRow_C(const uint8_t* src_argb, const int g_sub = src_argb1[1]; const int r_sub = src_argb1[2]; const int a_sub = src_argb1[3]; - dst_argb[0] = SHADE(b, b_sub); - dst_argb[1] = SHADE(g, g_sub); - dst_argb[2] = SHADE(r, r_sub); - dst_argb[3] = SHADE(a, a_sub); + dst_argb[0] = STATIC_CAST(uint8_t, SHADE(b, b_sub)); + dst_argb[1] = STATIC_CAST(uint8_t, SHADE(g, g_sub)); + dst_argb[2] = STATIC_CAST(uint8_t, SHADE(r, r_sub)); + dst_argb[3] = STATIC_CAST(uint8_t, SHADE(a, a_sub)); src_argb += 4; src_argb1 += 4; dst_argb += 4; @@ -1431,7 +1484,7 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { // clang-format off -#if defined(__aarch64__) || defined(__arm__) +#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) // Bias values include subtract 128 from U and V, bias from Y and rounding. // For B and R bias is negative. For G bias is positive. 
#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \ @@ -1627,7 +1680,7 @@ MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR) #undef MAKEYUVCONSTANTS -#if defined(__aarch64__) || defined(__arm__) +#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) #define LOAD_YUV_CONSTANTS \ int ub = yuvconstants->kUVCoeff[0]; \ int vr = yuvconstants->kUVCoeff[1]; \ @@ -1675,9 +1728,9 @@ static __inline void YuvPixel(uint8_t y, LOAD_YUV_CONSTANTS; uint32_t y32 = y * 0x0101; CALC_RGB16; - *b = Clamp((int32_t)(b16) >> 6); - *g = Clamp((int32_t)(g16) >> 6); - *r = Clamp((int32_t)(r16) >> 6); + *b = STATIC_CAST(uint8_t, Clamp((int32_t)(b16) >> 6)); + *g = STATIC_CAST(uint8_t, Clamp((int32_t)(g16) >> 6)); + *r = STATIC_CAST(uint8_t, Clamp((int32_t)(r16) >> 6)); } // Reads 8 bit YUV and leaves result as 16 bit. @@ -1706,9 +1759,9 @@ static __inline void YuvPixel10_16(uint16_t y, int* r, const struct YuvConstants* yuvconstants) { LOAD_YUV_CONSTANTS; - uint32_t y32 = y << 6; - u = clamp255(u >> 2); - v = clamp255(v >> 2); + uint32_t y32 = (y << 6) | (y >> 4); + u = STATIC_CAST(uint8_t, clamp255(u >> 2)); + v = STATIC_CAST(uint8_t, clamp255(v >> 2)); CALC_RGB16; *b = b16; *g = g16; @@ -1725,9 +1778,9 @@ static __inline void YuvPixel12_16(int16_t y, int* r, const struct YuvConstants* yuvconstants) { LOAD_YUV_CONSTANTS; - uint32_t y32 = y << 4; - u = clamp255(u >> 4); - v = clamp255(v >> 4); + uint32_t y32 = (y << 4) | (y >> 8); + u = STATIC_CAST(uint8_t, clamp255(u >> 4)); + v = STATIC_CAST(uint8_t, clamp255(v >> 4)); CALC_RGB16; *b = b16; *g = g16; @@ -1747,9 +1800,9 @@ static __inline void YuvPixel10(uint16_t y, int g16; int r16; YuvPixel10_16(y, u, v, &b16, &g16, &r16, yuvconstants); - *b = Clamp(b16 >> 6); - *g = Clamp(g16 >> 6); - *r = Clamp(r16 >> 6); + *b = STATIC_CAST(uint8_t, Clamp(b16 >> 6)); + *g = STATIC_CAST(uint8_t, Clamp(g16 >> 6)); + *r = STATIC_CAST(uint8_t, Clamp(r16 >> 6)); } // C reference code that mimics the YUV 12 bit assembly. 
@@ -1765,9 +1818,9 @@ static __inline void YuvPixel12(uint16_t y, int g16; int r16; YuvPixel12_16(y, u, v, &b16, &g16, &r16, yuvconstants); - *b = Clamp(b16 >> 6); - *g = Clamp(g16 >> 6); - *r = Clamp(r16 >> 6); + *b = STATIC_CAST(uint8_t, Clamp(b16 >> 6)); + *g = STATIC_CAST(uint8_t, Clamp(g16 >> 6)); + *r = STATIC_CAST(uint8_t, Clamp(r16 >> 6)); } // C reference code that mimics the YUV 16 bit assembly. @@ -1781,12 +1834,12 @@ static __inline void YuvPixel16_8(uint16_t y, const struct YuvConstants* yuvconstants) { LOAD_YUV_CONSTANTS; uint32_t y32 = y; - u = clamp255(u >> 8); - v = clamp255(v >> 8); + u = STATIC_CAST(uint16_t, clamp255(u >> 8)); + v = STATIC_CAST(uint16_t, clamp255(v >> 8)); CALC_RGB16; - *b = Clamp((int32_t)(b16) >> 6); - *g = Clamp((int32_t)(g16) >> 6); - *r = Clamp((int32_t)(r16) >> 6); + *b = STATIC_CAST(uint8_t, Clamp((int32_t)(b16) >> 6)); + *g = STATIC_CAST(uint8_t, Clamp((int32_t)(g16) >> 6)); + *r = STATIC_CAST(uint8_t, Clamp((int32_t)(r16) >> 6)); } // C reference code that mimics the YUV 16 bit assembly. 
@@ -1800,8 +1853,8 @@ static __inline void YuvPixel16_16(uint16_t y, const struct YuvConstants* yuvconstants) { LOAD_YUV_CONSTANTS; uint32_t y32 = y; - u = clamp255(u >> 8); - v = clamp255(v >> 8); + u = STATIC_CAST(uint16_t, clamp255(u >> 8)); + v = STATIC_CAST(uint16_t, clamp255(v >> 8)); CALC_RGB16; *b = b16; *g = g16; @@ -1815,7 +1868,7 @@ static __inline void YPixel(uint8_t y, uint8_t* g, uint8_t* r, const struct YuvConstants* yuvconstants) { -#if defined(__aarch64__) || defined(__arm__) +#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) int yg = yuvconstants->kRGBCoeffBias[0]; int ygb = yuvconstants->kRGBCoeffBias[4]; #else @@ -1823,9 +1876,9 @@ static __inline void YPixel(uint8_t y, int yg = yuvconstants->kYToRgb[0]; #endif uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; - *b = Clamp(((int32_t)(y1) + ygb) >> 6); - *g = Clamp(((int32_t)(y1) + ygb) >> 6); - *r = Clamp(((int32_t)(y1) + ygb) >> 6); + *b = STATIC_CAST(uint8_t, Clamp(((int32_t)(y1) + ygb) >> 6)); + *g = STATIC_CAST(uint8_t, Clamp(((int32_t)(y1) + ygb) >> 6)); + *r = STATIC_CAST(uint8_t, Clamp(((int32_t)(y1) + ygb) >> 6)); } void I444ToARGBRow_C(const uint8_t* src_y, @@ -1846,6 +1899,23 @@ void I444ToARGBRow_C(const uint8_t* src_y, } } +void I444ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + src_y += 1; + src_u += 1; + src_v += 1; + rgb_buf += 3; // Advance 1 pixel. 
+ } +} + // Also used for 420 void I422ToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, @@ -1929,10 +1999,10 @@ void I210AlphaToARGBRow_C(const uint16_t* src_y, for (x = 0; x < width - 1; x += 2) { YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = clamp255(src_a[0] >> 2); + rgb_buf[3] = STATIC_CAST(uint8_t, clamp255(src_a[0] >> 2)); YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); - rgb_buf[7] = clamp255(src_a[1] >> 2); + rgb_buf[7] = STATIC_CAST(uint8_t, clamp255(src_a[1] >> 2)); src_y += 2; src_u += 1; src_v += 1; @@ -1942,7 +2012,7 @@ void I210AlphaToARGBRow_C(const uint16_t* src_y, if (width & 1) { YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = clamp255(src_a[0] >> 2); + rgb_buf[3] = STATIC_CAST(uint8_t, clamp255(src_a[0] >> 2)); } } @@ -1957,7 +2027,7 @@ void I410AlphaToARGBRow_C(const uint16_t* src_y, for (x = 0; x < width; ++x) { YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = clamp255(src_a[0] >> 2); + rgb_buf[3] = STATIC_CAST(uint8_t, clamp255(src_a[0] >> 2)); src_y += 1; src_u += 1; src_v += 1; @@ -2283,8 +2353,10 @@ void I422ToARGB4444Row_C(const uint8_t* src_y, b1 = b1 >> 4; g1 = g1 >> 4; r1 = r1 >> 4; - *(uint16_t*)(dst_argb4444 + 0) = b0 | (g0 << 4) | (r0 << 8) | 0xf000; - *(uint16_t*)(dst_argb4444 + 2) = b1 | (g1 << 4) | (r1 << 8) | 0xf000; + *(uint16_t*)(dst_argb4444 + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 4) | (r0 << 8) | 0xf000); + *(uint16_t*)(dst_argb4444 + 2) = + STATIC_CAST(uint16_t, b1 | (g1 << 4) | (r1 << 8) | 0xf000); src_y += 2; src_u += 1; src_v += 1; @@ -2295,7 +2367,8 @@ void I422ToARGB4444Row_C(const uint8_t* src_y, b0 = b0 >> 4; g0 = g0 >> 4; r0 = r0 >> 4; - *(uint16_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000; + *(uint16_t*)(dst_argb4444) = + STATIC_CAST(uint16_t, b0 | (g0 << 4) | (r0 
<< 8) | 0xf000); } } @@ -2321,8 +2394,10 @@ void I422ToARGB1555Row_C(const uint8_t* src_y, b1 = b1 >> 3; g1 = g1 >> 3; r1 = r1 >> 3; - *(uint16_t*)(dst_argb1555 + 0) = b0 | (g0 << 5) | (r0 << 10) | 0x8000; - *(uint16_t*)(dst_argb1555 + 2) = b1 | (g1 << 5) | (r1 << 10) | 0x8000; + *(uint16_t*)(dst_argb1555 + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 10) | 0x8000); + *(uint16_t*)(dst_argb1555 + 2) = + STATIC_CAST(uint16_t, b1 | (g1 << 5) | (r1 << 10) | 0x8000); src_y += 2; src_u += 1; src_v += 1; @@ -2333,7 +2408,8 @@ void I422ToARGB1555Row_C(const uint8_t* src_y, b0 = b0 >> 3; g0 = g0 >> 3; r0 = r0 >> 3; - *(uint16_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | 0x8000; + *(uint16_t*)(dst_argb1555) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 10) | 0x8000); } } @@ -2359,8 +2435,10 @@ void I422ToRGB565Row_C(const uint8_t* src_y, b1 = b1 >> 3; g1 = g1 >> 2; r1 = r1 >> 3; - *(uint16_t*)(dst_rgb565 + 0) = b0 | (g0 << 5) | (r0 << 11); // for ubsan - *(uint16_t*)(dst_rgb565 + 2) = b1 | (g1 << 5) | (r1 << 11); + *(uint16_t*)(dst_rgb565 + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11)); + *(uint16_t*)(dst_rgb565 + 2) = + STATIC_CAST(uint16_t, b1 | (g1 << 5) | (r1 << 11)); src_y += 2; src_u += 1; src_v += 1; @@ -2371,7 +2449,8 @@ void I422ToRGB565Row_C(const uint8_t* src_y, b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; - *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + *(uint16_t*)(dst_rgb565 + 0) = + STATIC_CAST(uint16_t, b0 | (g0 << 5) | (r0 << 11)); } } @@ -2486,8 +2565,12 @@ void NV12ToRGB565Row_C(const uint8_t* src_y, b1 = b1 >> 3; g1 = g1 >> 2; r1 = r1 >> 3; - *(uint16_t*)(dst_rgb565 + 0) = b0 | (g0 << 5) | (r0 << 11); - *(uint16_t*)(dst_rgb565 + 2) = b1 | (g1 << 5) | (r1 << 11); + *(uint16_t*)(dst_rgb565 + 0) = STATIC_CAST(uint16_t, b0) | + STATIC_CAST(uint16_t, g0 << 5) | + STATIC_CAST(uint16_t, r0 << 11); + *(uint16_t*)(dst_rgb565 + 2) = STATIC_CAST(uint16_t, b1) | + STATIC_CAST(uint16_t, g1 << 5) | + STATIC_CAST(uint16_t, r1 << 11); 
src_y += 2; src_uv += 2; dst_rgb565 += 4; // Advance 2 pixels. @@ -2497,7 +2580,9 @@ void NV12ToRGB565Row_C(const uint8_t* src_y, b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; - *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + *(uint16_t*)(dst_rgb565) = STATIC_CAST(uint16_t, b0) | + STATIC_CAST(uint16_t, g0 << 5) | + STATIC_CAST(uint16_t, r0 << 11); } } @@ -2603,6 +2688,19 @@ void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { } } +void MirrorRow_16_C(const uint16_t* src, uint16_t* dst, int width) { + int x; + src += width - 1; + for (x = 0; x < width - 1; x += 2) { + dst[x] = src[0]; + dst[x + 1] = src[-1]; + src -= 2; + } + if (width & 1) { + dst[width - 1] = src[0]; + } +} + void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) { int x; src_uv += (width - 1) << 1; @@ -2714,6 +2812,21 @@ void DetileRow_C(const uint8_t* src, } } +void DetileRow_16_C(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width) { + int x; + for (x = 0; x < width - 15; x += 16) { + memcpy(dst, src, 16 * sizeof(uint16_t)); + dst += 16; + src += src_tile_stride; + } + if (width & 15) { + memcpy(dst, src, (width & 15) * sizeof(uint16_t)); + } +} + void DetileSplitUVRow_C(const uint8_t* src_uv, ptrdiff_t src_tile_stride, uint8_t* dst_u, @@ -2731,6 +2844,51 @@ void DetileSplitUVRow_C(const uint8_t* src_uv, } } +void DetileToYUY2_C(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + for (int x = 0; x < width - 15; x += 16) { + for (int i = 0; i < 8; i++) { + dst_yuy2[0] = src_y[0]; + dst_yuy2[1] = src_uv[0]; + dst_yuy2[2] = src_y[1]; + dst_yuy2[3] = src_uv[1]; + dst_yuy2 += 4; + src_y += 2; + src_uv += 2; + } + src_y += src_y_tile_stride - 16; + src_uv += src_uv_tile_stride - 16; + } +} + +// Unpack MT2T into tiled P010 64 pixels at a time. MT2T's bitstream is encoded +// in 80 byte blocks representing 64 pixels each. 
The first 16 bytes of the +// block contain all of the lower 2 bits of each pixel packed together, and the +// next 64 bytes represent all the upper 8 bits of the pixel. The lower bits are +// packed into 1x4 blocks, whereas the upper bits are packed in normal raster +// order. +void UnpackMT2T_C(const uint8_t* src, uint16_t* dst, size_t size) { + for (size_t i = 0; i < size; i += 80) { + const uint8_t* src_lower_bits = src; + const uint8_t* src_upper_bits = src + 16; + + for (int j = 0; j < 4; j++) { + for (int k = 0; k < 16; k++) { + *dst++ = ((src_lower_bits[k] >> (j * 2)) & 0x3) << 6 | + (uint16_t)*src_upper_bits << 8 | + (uint16_t)*src_upper_bits >> 2; + src_upper_bits++; + } + } + + src += 80; + } +} + void SplitRGBRow_C(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, @@ -2823,10 +2981,10 @@ void MergeAR64Row_C(const uint16_t* src_r, int shift = 16 - depth; int max = (1 << depth) - 1; for (x = 0; x < width; ++x) { - dst_ar64[0] = ClampMax(src_b[x], max) << shift; - dst_ar64[1] = ClampMax(src_g[x], max) << shift; - dst_ar64[2] = ClampMax(src_r[x], max) << shift; - dst_ar64[3] = ClampMax(src_a[x], max) << shift; + dst_ar64[0] = STATIC_CAST(uint16_t, ClampMax(src_b[x], max) << shift); + dst_ar64[1] = STATIC_CAST(uint16_t, ClampMax(src_g[x], max) << shift); + dst_ar64[2] = STATIC_CAST(uint16_t, ClampMax(src_r[x], max) << shift); + dst_ar64[3] = STATIC_CAST(uint16_t, ClampMax(src_a[x], max) << shift); dst_ar64 += 4; } } @@ -2843,10 +3001,10 @@ void MergeARGB16To8Row_C(const uint16_t* src_r, int x; int shift = depth - 8; for (x = 0; x < width; ++x) { - dst_argb[0] = clamp255(src_b[x] >> shift); - dst_argb[1] = clamp255(src_g[x] >> shift); - dst_argb[2] = clamp255(src_r[x] >> shift); - dst_argb[3] = clamp255(src_a[x] >> shift); + dst_argb[0] = STATIC_CAST(uint8_t, clamp255(src_b[x] >> shift)); + dst_argb[1] = STATIC_CAST(uint8_t, clamp255(src_g[x] >> shift)); + dst_argb[2] = STATIC_CAST(uint8_t, clamp255(src_r[x] >> shift)); + dst_argb[3] = 
STATIC_CAST(uint8_t, clamp255(src_a[x] >> shift)); dst_argb += 4; } } @@ -2863,9 +3021,9 @@ void MergeXR64Row_C(const uint16_t* src_r, int shift = 16 - depth; int max = (1 << depth) - 1; for (x = 0; x < width; ++x) { - dst_ar64[0] = ClampMax(src_b[x], max) << shift; - dst_ar64[1] = ClampMax(src_g[x], max) << shift; - dst_ar64[2] = ClampMax(src_r[x], max) << shift; + dst_ar64[0] = STATIC_CAST(uint16_t, ClampMax(src_b[x], max) << shift); + dst_ar64[1] = STATIC_CAST(uint16_t, ClampMax(src_g[x], max) << shift); + dst_ar64[2] = STATIC_CAST(uint16_t, ClampMax(src_r[x], max) << shift); dst_ar64[3] = 0xffff; dst_ar64 += 4; } @@ -2882,9 +3040,9 @@ void MergeXRGB16To8Row_C(const uint16_t* src_r, int x; int shift = depth - 8; for (x = 0; x < width; ++x) { - dst_argb[0] = clamp255(src_b[x] >> shift); - dst_argb[1] = clamp255(src_g[x] >> shift); - dst_argb[2] = clamp255(src_r[x] >> shift); + dst_argb[0] = STATIC_CAST(uint8_t, clamp255(src_b[x] >> shift)); + dst_argb[1] = STATIC_CAST(uint8_t, clamp255(src_g[x] >> shift)); + dst_argb[2] = STATIC_CAST(uint8_t, clamp255(src_r[x] >> shift)); dst_argb[3] = 0xff; dst_argb += 4; } @@ -2930,8 +3088,8 @@ void MergeUVRow_16_C(const uint16_t* src_u, assert(depth <= 16); int x; for (x = 0; x < width; ++x) { - dst_uv[0] = src_u[x] << shift; - dst_uv[1] = src_v[x] << shift; + dst_uv[0] = STATIC_CAST(uint16_t, src_u[x] << shift); + dst_uv[1] = STATIC_CAST(uint16_t, src_v[x] << shift); dst_uv += 2; } } @@ -2959,7 +3117,7 @@ void MultiplyRow_16_C(const uint16_t* src_y, int width) { int x; for (x = 0; x < width; ++x) { - dst_y[x] = src_y[x] * scale; + dst_y[x] = STATIC_CAST(uint16_t, src_y[x] * scale); } } @@ -2990,7 +3148,7 @@ void Convert16To8Row_C(const uint16_t* src_y, assert(scale <= 32768); for (x = 0; x < width; ++x) { - dst_y[x] = C16TO8(src_y[x], scale); + dst_y[x] = STATIC_CAST(uint8_t, C16TO8(src_y[x], scale)); } } @@ -3043,6 +3201,21 @@ void YUY2ToUVRow_C(const uint8_t* src_yuy2, } } +// Filter 2 rows of YUY2 UV's (422) into UV 
(NV12). +void YUY2ToNVUVRow_C(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_uv, + int width) { + // Output a row of UV values, filtering 2 rows of YUY2. + int x; + for (x = 0; x < width; x += 2) { + dst_uv[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1; + dst_uv[1] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1; + src_yuy2 += 4; + dst_uv += 2; + } +} + // Copy row of YUY2 UV's (422) into U and V (422). void YUY2ToUV422Row_C(const uint8_t* src_yuy2, uint8_t* dst_u, @@ -3138,9 +3311,9 @@ void ARGBBlendRow_C(const uint8_t* src_argb, uint32_t bb = src_argb1[0]; uint32_t bg = src_argb1[1]; uint32_t br = src_argb1[2]; - dst_argb[0] = BLEND(fb, bb, a); - dst_argb[1] = BLEND(fg, bg, a); - dst_argb[2] = BLEND(fr, br, a); + dst_argb[0] = STATIC_CAST(uint8_t, BLEND(fb, bb, a)); + dst_argb[1] = STATIC_CAST(uint8_t, BLEND(fg, bg, a)); + dst_argb[2] = STATIC_CAST(uint8_t, BLEND(fr, br, a)); dst_argb[3] = 255u; fb = src_argb[4 + 0]; @@ -3150,9 +3323,9 @@ void ARGBBlendRow_C(const uint8_t* src_argb, bb = src_argb1[4 + 0]; bg = src_argb1[4 + 1]; br = src_argb1[4 + 2]; - dst_argb[4 + 0] = BLEND(fb, bb, a); - dst_argb[4 + 1] = BLEND(fg, bg, a); - dst_argb[4 + 2] = BLEND(fr, br, a); + dst_argb[4 + 0] = STATIC_CAST(uint8_t, BLEND(fb, bb, a)); + dst_argb[4 + 1] = STATIC_CAST(uint8_t, BLEND(fg, bg, a)); + dst_argb[4 + 2] = STATIC_CAST(uint8_t, BLEND(fr, br, a)); dst_argb[4 + 3] = 255u; src_argb += 8; src_argb1 += 8; @@ -3167,9 +3340,9 @@ void ARGBBlendRow_C(const uint8_t* src_argb, uint32_t bb = src_argb1[0]; uint32_t bg = src_argb1[1]; uint32_t br = src_argb1[2]; - dst_argb[0] = BLEND(fb, bb, a); - dst_argb[1] = BLEND(fg, bg, a); - dst_argb[2] = BLEND(fr, br, a); + dst_argb[0] = STATIC_CAST(uint8_t, BLEND(fb, bb, a)); + dst_argb[1] = STATIC_CAST(uint8_t, BLEND(fg, bg, a)); + dst_argb[2] = STATIC_CAST(uint8_t, BLEND(fr, br, a)); dst_argb[3] = 255u; } } @@ -3214,7 +3387,7 @@ void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int 
width) { dst_argb[0] = ATTENUATE(b, a); dst_argb[1] = ATTENUATE(g, a); dst_argb[2] = ATTENUATE(r, a); - dst_argb[3] = a; + dst_argb[3] = STATIC_CAST(uint8_t, a); b = src_argb[4]; g = src_argb[5]; r = src_argb[6]; @@ -3222,7 +3395,7 @@ void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { dst_argb[4] = ATTENUATE(b, a); dst_argb[5] = ATTENUATE(g, a); dst_argb[6] = ATTENUATE(r, a); - dst_argb[7] = a; + dst_argb[7] = STATIC_CAST(uint8_t, a); src_argb += 8; dst_argb += 8; } @@ -3235,7 +3408,7 @@ void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { dst_argb[0] = ATTENUATE(b, a); dst_argb[1] = ATTENUATE(g, a); dst_argb[2] = ATTENUATE(r, a); - dst_argb[3] = a; + dst_argb[3] = STATIC_CAST(uint8_t, a); } } #undef ATTENUATE @@ -3307,10 +3480,10 @@ void ARGBUnattenuateRow_C(const uint8_t* src_argb, const uint32_t ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point // Clamping should not be necessary but is free in assembly. - dst_argb[0] = UNATTENUATE(b, ia); - dst_argb[1] = UNATTENUATE(g, ia); - dst_argb[2] = UNATTENUATE(r, ia); - dst_argb[3] = a; + dst_argb[0] = STATIC_CAST(uint8_t, UNATTENUATE(b, ia)); + dst_argb[1] = STATIC_CAST(uint8_t, UNATTENUATE(g, ia)); + dst_argb[2] = STATIC_CAST(uint8_t, UNATTENUATE(r, ia)); + dst_argb[3] = STATIC_CAST(uint8_t, a); src_argb += 4; dst_argb += 4; } @@ -3344,12 +3517,20 @@ void CumulativeSumToAverageRow_C(const int32_t* tl, int i; assert(area != 0); - ooa = 1.0f / area; + ooa = 1.0f / STATIC_CAST(float, area); for (i = 0; i < count; ++i) { - dst[0] = (uint8_t)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa); - dst[1] = (uint8_t)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa); - dst[2] = (uint8_t)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa); - dst[3] = (uint8_t)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa); + dst[0] = + (uint8_t)(STATIC_CAST(float, bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * + ooa); + dst[1] = + (uint8_t)(STATIC_CAST(float, bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * + 
ooa); + dst[2] = + (uint8_t)(STATIC_CAST(float, bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * + ooa); + dst[3] = + (uint8_t)(STATIC_CAST(float, bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * + ooa); dst += 4; tl += 4; bl += 4; @@ -3407,7 +3588,9 @@ static void HalfRow_16To8_C(const uint16_t* src_uv, int width) { int x; for (x = 0; x < width; ++x) { - dst_uv[x] = C16TO8((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1, scale); + dst_uv[x] = STATIC_CAST( + uint8_t, + C16TO8((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1, scale)); } } @@ -3433,8 +3616,9 @@ void InterpolateRow_C(uint8_t* dst_ptr, return; } for (x = 0; x < width; ++x) { - dst_ptr[0] = - (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8; + dst_ptr[0] = STATIC_CAST( + uint8_t, + (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8); ++src_ptr; ++src_ptr1; ++dst_ptr; @@ -3463,8 +3647,9 @@ void InterpolateRow_16_C(uint16_t* dst_ptr, return; } for (x = 0; x < width; ++x) { - dst_ptr[0] = - (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8; + dst_ptr[0] = STATIC_CAST( + uint16_t, + (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8); ++src_ptr; ++src_ptr1; ++dst_ptr; @@ -3501,9 +3686,11 @@ void InterpolateRow_16To8_C(uint8_t* dst_ptr, return; } for (x = 0; x < width; ++x) { - dst_ptr[0] = C16TO8( - (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8, - scale); + dst_ptr[0] = STATIC_CAST( + uint8_t, + C16TO8( + (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8, + scale)); src_ptr += 1; src_ptr1 += 1; dst_ptr += 1; @@ -3615,10 +3802,10 @@ void ARGBPolynomialRow_C(const uint8_t* src_argb, dr += poly[14] * r3; da += poly[15] * a3; - dst_argb[0] = Clamp((int32_t)(db)); - dst_argb[1] = Clamp((int32_t)(dg)); - dst_argb[2] = Clamp((int32_t)(dr)); - dst_argb[3] = Clamp((int32_t)(da)); + dst_argb[0] = STATIC_CAST(uint8_t, Clamp((int32_t)(db))); + dst_argb[1] = STATIC_CAST(uint8_t, Clamp((int32_t)(dg))); + dst_argb[2] = STATIC_CAST(uint8_t, 
Clamp((int32_t)(dr))); + dst_argb[3] = STATIC_CAST(uint8_t, Clamp((int32_t)(da))); src_argb += 4; dst_argb += 4; } @@ -4023,6 +4210,32 @@ void I422ToRGB24Row_AVX2(const uint8_t* src_y, } #endif +#if defined(HAS_I444TORGB24ROW_AVX2) +void I444ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I444ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB24ROW_AVX2) + ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); +#else + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); +#endif + src_y += twidth; + src_u += twidth; + src_v += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + #if defined(HAS_NV12TORGB565ROW_AVX2) void NV12ToRGB565Row_AVX2(const uint8_t* src_y, const uint8_t* src_uv, @@ -4164,8 +4377,9 @@ void ScaleSamples_C(const float* src, float* dst, float scale, int width) { void GaussRow_C(const uint32_t* src, uint16_t* dst, int width) { int i; for (i = 0; i < width; ++i) { - *dst++ = - (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8; + *dst++ = STATIC_CAST( + uint16_t, + (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8); ++src; } } @@ -4325,6 +4539,8 @@ void HalfMergeUVRow_C(const uint8_t* src_u, } } +#undef STATIC_CAST + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/row_gcc.cc b/files/source/row_gcc.cc index dce8c439..e94fd04d 100644 --- a/files/source/row_gcc.cc +++ b/files/source/row_gcc.cc @@ -27,6 +27,9 @@ static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u, static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u}; +static const uvec8 kABGRToYJ = {77u, 150u, 29u, 0u, 77u, 150u, 29u, 
0u, + 77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u}; + static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u}; #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) @@ -39,12 +42,18 @@ static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0}; +static const vec8 kABGRToUJ = {-43, -84, 127, 0, -43, -84, 127, 0, + -43, -84, 127, 0, -43, -84, 127, 0}; + static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0}; static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0}; +static const vec8 kABGRToVJ = {127, -107, -20, 0, 127, -107, -20, 0, + 127, -107, -20, 0, 127, -107, -20, 0}; + // Constants for BGRA static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u}; @@ -729,7 +738,7 @@ void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, uint8_t* dst, - const uint32_t dither4, + uint32_t dither4, int width) { asm volatile( "movd %3,%%xmm6 \n" @@ -777,7 +786,7 @@ void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, uint8_t* dst, - const uint32_t dither4, + uint32_t dither4, int width) { asm volatile( "vbroadcastss %3,%%xmm6 \n" @@ -1201,6 +1210,7 @@ void ARGBToAR64Row_AVX2(const uint8_t* src_argb, "lea 0x40(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_ar64), // %1 "+r"(width) // %2 @@ -1228,6 +1238,7 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb, "lea 0x40(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_ab64), // %1 "+r"(width) // %2 @@ -1256,6 +1267,7 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, 
"lea 0x20(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src_ar64), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -1284,6 +1296,7 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, "lea 0x20(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src_ab64), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -1398,6 +1411,24 @@ void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { } #endif // HAS_ARGBTOYJROW_SSSE3 +#ifdef HAS_ABGRTOYJROW_SSSE3 +// Convert 16 ABGR pixels (64 bytes) to 16 YJ values. +// Same as ABGRToYRow but different coefficients, no add 16. +void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN RGBTOY(xmm5) + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kABGRToYJ), // %3 + "m"(kSub128) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_ABGRTOYJROW_SSSE3 + #ifdef HAS_RGBATOYJROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16. @@ -1416,7 +1447,8 @@ void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { } #endif // HAS_RGBATOYJROW_SSSE3 -#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ARGBEXTRACTALPHAROW_AVX2) +#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ABGRTOYROW_AVX2) || \ + defined(HAS_ARGBEXTRACTALPHAROW_AVX2) // vpermd for vphaddw + vpackuswb vpermd. 
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; #endif @@ -1429,9 +1461,8 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqu %6,%%ymm6 \n" - - LABELALIGN RGBTOY_AVX2(ymm7) + "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( + ymm7) "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1451,9 +1482,8 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqu %6,%%ymm6 \n" - - LABELALIGN RGBTOY_AVX2(ymm7) + "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( + ymm7) "vzeroupper \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1472,9 +1502,8 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" - - LABELALIGN RGBTOY_AVX2(ymm5) + "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( + ymm5) "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1486,15 +1515,32 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { } #endif // HAS_ARGBTOYJROW_AVX2 +#ifdef HAS_ABGRTOYJROW_AVX2 +// Convert 32 ABGR pixels (128 bytes) to 32 Y values. +void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( + ymm5) "vzeroupper \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kABGRToYJ), // %3 + "m"(kSub128), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ABGRTOYJROW_AVX2 + #ifdef HAS_RGBATOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" - - LABELALIGN RGBTOY_AVX2( + "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( ymm5) "vzeroupper \n" : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 @@ -1571,11 +1617,15 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb, } #endif // HAS_ARGBTOUVROW_SSSE3 -#ifdef HAS_ARGBTOUVROW_AVX2 +#if defined(HAS_ARGBTOUVROW_AVX2) || defined(HAS_ABGRTOUVROW_AVX2) || \ + defined(HAS_ARGBTOUVJROW_AVX2) || defined(HAS_ABGRTOUVJROW_AVX2) // vpshufb for vphaddw + vpackuswb packed to shorts. static const lvec8 kShufARGBToUV_AVX = { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; +#endif + +#if defined(HAS_ARGBTOUVROW_AVX2) void ARGBToUVRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -1765,6 +1815,71 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBTOUVJROW_AVX2 +// TODO(fbarchard): Pass kABGRToVJ / kABGRToUJ as matrix +#ifdef HAS_ABGRTOUVJROW_AVX2 +void ABGRToUVJRow_AVX2(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw 
%%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_abgr), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)), // %4 + "m"(kSub128), // %5 + "m"(kABGRToVJ), // %6 + "m"(kABGRToUJ), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ABGRTOUVJROW_AVX2 + #ifdef HAS_ARGBTOUVJROW_SSSE3 void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, @@ -1831,6 +1946,72 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, } #endif // HAS_ARGBTOUVJROW_SSSE3 +#ifdef HAS_ABGRTOUVJROW_SSSE3 +void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + 
"shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)), // %4 + "m"(kABGRToVJ), // %5 + "m"(kABGRToUJ), // %6 + "m"(kSub128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); +} +#endif // HAS_ABGRTOUVJROW_SSSE3 + #ifdef HAS_ARGBTOUV444ROW_SSSE3 void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, @@ -2153,9 +2334,6 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 UV from 422 10 bit, upsample to 8 UV -// TODO(fbarchard): Consider shufb to replace pack/unpack -// TODO(fbarchard): Consider pmulhuw to replace psraw -// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits. 
#define READYUV210 \ "movq (%[u_buf]),%%xmm3 \n" \ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ @@ -2165,7 +2343,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, "packuswb %%xmm3,%%xmm3 \n" \ "punpcklwd %%xmm3,%%xmm3 \n" \ "movdqu (%[y_buf]),%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm2 \n" \ "psllw $6,%%xmm4 \n" \ + "psrlw $4,%%xmm2 \n" \ + "paddw %%xmm2,%%xmm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" #define READYUVA210 \ @@ -2177,7 +2358,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, "packuswb %%xmm3,%%xmm3 \n" \ "punpcklwd %%xmm3,%%xmm3 \n" \ "movdqu (%[y_buf]),%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm2 \n" \ "psllw $6,%%xmm4 \n" \ + "psrlw $4,%%xmm2 \n" \ + "paddw %%xmm2,%%xmm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" \ "movdqu (%[a_buf]),%%xmm5 \n" \ "psraw $2,%%xmm5 \n" \ @@ -2196,7 +2380,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, "punpckhwd %%xmm2,%%xmm1 \n" \ "packuswb %%xmm1,%%xmm3 \n" \ "movdqu (%[y_buf]),%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm2 \n" \ "psllw $6,%%xmm4 \n" \ + "psrlw $4,%%xmm2 \n" \ + "paddw %%xmm2,%%xmm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 8 UV from 444 10 bit. With 8 Alpha. @@ -2211,7 +2398,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, "punpckhwd %%xmm2,%%xmm1 \n" \ "packuswb %%xmm1,%%xmm3 \n" \ "movdqu (%[y_buf]),%%xmm4 \n" \ - "psllw $0x6,%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm2 \n" \ + "psllw $6,%%xmm4 \n" \ + "psrlw $4,%%xmm2 \n" \ + "paddw %%xmm2,%%xmm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" \ "movdqu (%[a_buf]),%%xmm5 \n" \ "psraw $2,%%xmm5 \n" \ @@ -2228,7 +2418,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, "packuswb %%xmm3,%%xmm3 \n" \ "punpcklwd %%xmm3,%%xmm3 \n" \ "movdqu (%[y_buf]),%%xmm4 \n" \ - "psllw $0x4,%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm2 \n" \ + "psllw $4,%%xmm4 \n" \ + "psrlw $8,%%xmm2 \n" \ + "paddw %%xmm2,%%xmm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. 
@@ -2399,6 +2592,20 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \ "lea 0x20(%[dst_rgba]),%[dst_rgba] \n" +// Store 8 RGB24 values. +#define STORERGB24 \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklbw %%xmm2,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "pshufb %%xmm5,%%xmm0 \n" \ + "pshufb %%xmm6,%%xmm1 \n" \ + "palignr $0xc,%%xmm0,%%xmm1 \n" \ + "movq %%xmm0,(%[dst_rgb24]) \n" \ + "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" \ + "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" + // Store 8 AR30 values. #define STOREAR30 \ "psraw $0x4,%%xmm0 \n" \ @@ -2508,17 +2715,43 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf, "1: \n" READYUV422 YUVTORGB(yuvconstants) - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "movq %%xmm0,(%[dst_rgb24]) \n" - "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" - "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" + STORERGB24 + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] +#if defined(__i386__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] +#endif + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), + [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} + +void OMITFP I444ToRGB24Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" + "movdqa 
%[kShuffleMaskARGBToRGB24],%%xmm6 \n" + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUV444 + YUVTORGB(yuvconstants) + STORERGB24 "subl $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -3209,7 +3442,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm2 \n" \ + "vpsrlw $4,%%ymm4,%%ymm4 \n" \ + "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \ "lea 0x20(%[y_buf]),%[y_buf] \n" // Read 8 UV from 210, upsample to 16 UV. With 16 Alpha. @@ -3224,7 +3459,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm2 \n" \ + "vpsrlw $4,%%ymm4,%%ymm4 \n" \ + "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \ "lea 0x20(%[y_buf]),%[y_buf] \n" \ "vmovdqu (%[a_buf]),%%ymm5 \n" \ "vpsraw $2,%%ymm5,%%ymm5 \n" \ @@ -3242,7 +3479,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \ "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm2 \n" \ + "vpsrlw $4,%%ymm4,%%ymm4 \n" \ + "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \ "lea 0x20(%[y_buf]),%[y_buf] \n" // Read 8 UV from 212 12 bit, upsample to 16 UV @@ -3257,7 +3496,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $0x4,%%ymm4,%%ymm4 \n" \ + "vpsllw $4,%%ymm4,%%ymm2 \n" \ + "vpsrlw $8,%%ymm4,%%ymm4 \n" \ + "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \ "lea 0x20(%[y_buf]),%[y_buf] \n" // Read 16 UV from 410. With 16 Alpha. 
@@ -3271,7 +3512,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \ "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm2 \n" \ + "vpsrlw $4,%%ymm4,%%ymm4 \n" \ + "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \ "lea 0x20(%[y_buf]),%[y_buf] \n" \ "vmovdqu (%[a_buf]),%%ymm5 \n" \ "vpsraw $2,%%ymm5,%%ymm5 \n" \ @@ -4785,6 +5028,84 @@ void DetileRow_SSE2(const uint8_t* src, } #endif // HAS_DETILEROW_SSE2 +#ifdef HAS_DETILEROW_16_SSE2 +void DetileRow_16_SSE2(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width) { + asm volatile( + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride) // %3 + : "cc", "memory", "xmm0", "xmm1"); +} +#endif // HAS_DETILEROW_SSE2 + +#ifdef HAS_DETILEROW_16_AVX +void DetileRow_16_AVX(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width) { + asm volatile( + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "lea (%0,%3,2),%0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride) // %3 + : "cc", "memory", "xmm0"); +} +#endif // HAS_DETILEROW_AVX + +#ifdef HAS_DETILETOYUY2_SSE2 +// Read 16 Y, 8 UV, and write 8 YUYV. 
+void DetileToYUY2_SSE2(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "movdqu (%0),%%xmm0 \n" // Load 16 Y + "sub $0x10,%3 \n" + "lea (%0,%4),%0 \n" + "movdqu (%1),%%xmm1 \n" // Load 8 UV + "lea (%1,%5),%1 \n" + "movdqu %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqu %%xmm0,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "lea 0x20(%2),%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "xmm0", "xmm1", "xmm2" // Clobber list + ); +} +#endif + #ifdef HAS_DETILESPLITUVROW_SSSE3 // TODO(greenjustin): Look into generating these constants instead of loading // them since this can cause branch mispredicts for fPIC code on 32-bit @@ -4821,36 +5142,59 @@ void DetileSplitUVRow_SSSE3(const uint8_t* src_uv, } #endif // HAS_DETILESPLITUVROW_SSSE3 +#ifdef HAS_MERGEUVROW_AVX512BW +void MergeUVRow_AVX512BW(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + asm volatile("sub %0,%1 \n" + + LABELALIGN + "1: \n" + "vpmovzxbw (%0),%%zmm0 \n" + "vpmovzxbw 0x00(%0,%1,1),%%zmm1 \n" + "lea 0x20(%0),%0 \n" + "vpsllw $0x8,%%zmm1,%%zmm1 \n" + "vporq %%zmm0,%%zmm1,%%zmm2 \n" + "vmovdqu64 %%zmm2,(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_MERGEUVROW_AVX512BW + #ifdef HAS_MERGEUVROW_AVX2 void MergeUVRow_AVX2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile( + asm volatile("sub %0,%1 \n" - "sub %0,%1 \n" - - LABELALIGN + LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x00(%0,%1,1),%%ymm1 \n" - "lea 0x20(%0),%0 \n" - "vpunpcklbw 
%%ymm1,%%ymm0,%%ymm2 \n" - "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm2,(%2) \n" - "vextractf128 $0x0,%%ymm0,0x10(%2) \n" - "vextractf128 $0x1,%%ymm2,0x20(%2) \n" - "vextractf128 $0x1,%%ymm0,0x30(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x20,%3 \n" + "vpmovzxbw (%0),%%ymm0 \n" + "vpmovzxbw 0x00(%0,%1,1),%%ymm1 \n" + "lea 0x10(%0),%0 \n" + "vpsllw $0x8,%%ymm1,%%ymm1 \n" + "vpor %%ymm0,%%ymm1,%%ymm2 \n" + "vmovdqu %%ymm2,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x10,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_AVX2 @@ -4859,11 +5203,9 @@ void MergeUVRow_SSE2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile( + asm volatile("sub %0,%1 \n" - "sub %0,%1 \n" - - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%1,1),%%xmm1 \n" @@ -4876,12 +5218,12 @@ void MergeUVRow_SSE2(const uint8_t* src_u, "lea 0x20(%2),%2 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_SSE2 @@ -4891,37 +5233,35 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, uint16_t* dst_uv, int depth, int width) { - depth = 16 - depth; // clang-format off asm volatile ( "vmovd %4,%%xmm3 \n" + "vmovd %5,%%xmm4 \n" + + "sub %0,%1 \n" + // 8 pixels per loop. - // 16 pixels per loop. 
- LABELALIGN + LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu (%0,%1,1),%%ymm1 \n" - "add $0x20,%0 \n" - + "vpmovzxwd (%0),%%ymm0 \n" + "vpmovzxwd 0x00(%0,%1,1),%%ymm1 \n" + "lea 0x10(%0),%0 \n" "vpsllw %%xmm3,%%ymm0,%%ymm0 \n" - "vpsllw %%xmm3,%%ymm1,%%ymm1 \n" - "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates - "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm2,(%2) \n" - "vextractf128 $0x0,%%ymm0,0x10(%2) \n" - "vextractf128 $0x1,%%ymm2,0x20(%2) \n" - "vextractf128 $0x1,%%ymm0,0x30(%2) \n" - "add $0x40,%2 \n" - "sub $0x10,%3 \n" + "vpslld %%xmm4,%%ymm1,%%ymm1 \n" + "vpor %%ymm0,%%ymm1,%%ymm2 \n" + "vmovdqu %%ymm2,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : "r"(depth) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"(16 - depth), // %4 + "r"(32 - depth) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); // clang-format on } #endif // HAS_MERGEUVROW_AVX2 @@ -5127,7 +5467,6 @@ void Convert16To8Row_AVX2(const uint16_t* src_y, // 512 = 9 bits // 1024 = 10 bits // 4096 = 12 bits -// TODO(fbarchard): reduce to SSE2 void Convert8To16Row_SSE2(const uint8_t* src_y, uint16_t* dst_y, int scale, @@ -6178,6 +6517,7 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { "lea 0x40(%1),%1 \n" "sub $0x40,%2 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -6461,6 +6801,33 @@ void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { : "memory", "cc", "xmm0", "xmm1", "xmm5"); } +void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width) { + asm volatile(LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 
0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_uv), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(stride_yuy2)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); +} + void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_u, @@ -6661,6 +7028,35 @@ void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { : "memory", "cc", "xmm0", "xmm1", "xmm5"); } +void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%3,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%3,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_uv), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(stride_yuy2)) // %3 + : "memory", "cc", "xmm0", "xmm1"); +} + void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_u, diff --git a/files/source/row_lasx.cc b/files/source/row_lasx.cc index 7dd18f40..1082ad80 100644 --- a/files/source/row_lasx.cc +++ b/files/source/row_lasx.cc @@ -775,40 +775,6 @@ void UYVYToUV422Row_LASX(const uint8_t* src_uyvy, } } -void ARGBToYRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width) { - int x; - int len = width / 32; - __m256i src0, src1, src2, src3, vec0, vec1, vec2, vec3; - __m256i tmp0, tmp1, dst0; - __m256i const_19 = __lasx_xvldi(0x19); - __m256i const_42 = __lasx_xvldi(0x42); - __m256i const_81 = __lasx_xvldi(0x81); - __m256i const_1080 = {0x1080108010801080, 
0x1080108010801080, - 0x1080108010801080, 0x1080108010801080}; - __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002, - 0x0000000700000003}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lasx_xvld, src_argb0, 0, src_argb0, 32, src_argb0, 64, - src_argb0, 96, src0, src1, src2, src3); - vec0 = __lasx_xvpickev_b(src1, src0); - vec1 = __lasx_xvpickev_b(src3, src2); - vec2 = __lasx_xvpickod_b(src1, src0); - vec3 = __lasx_xvpickod_b(src3, src2); - tmp0 = __lasx_xvmaddwev_h_bu(const_1080, vec0, const_19); - tmp1 = __lasx_xvmaddwev_h_bu(const_1080, vec1, const_19); - tmp0 = __lasx_xvmaddwev_h_bu(tmp0, vec2, const_81); - tmp1 = __lasx_xvmaddwev_h_bu(tmp1, vec3, const_81); - tmp0 = __lasx_xvmaddwod_h_bu(tmp0, vec0, const_42); - tmp1 = __lasx_xvmaddwod_h_bu(tmp1, vec1, const_42); - dst0 = __lasx_xvssrani_b_h(tmp1, tmp0, 8); - dst0 = __lasx_xvperm_w(dst0, control); - __lasx_xvst(dst0, dst_y, 0); - src_argb0 += 128; - dst_y += 32; - } -} - void ARGBToUVRow_LASX(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, @@ -1216,7 +1182,7 @@ void ARGBAttenuateRow_LASX(const uint8_t* src_argb, void ARGBToRGB565DitherRow_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width) { int x; int len = width / 16; @@ -1811,48 +1777,6 @@ void RGB565ToUVRow_LASX(const uint8_t* src_rgb565, } } -void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { - int x; - int len = width / 32; - __m256i src0, src1, src2; - __m256i tmp0, tmp1, tmp2, tmp3; - __m256i reg0, reg1, reg2, dst0; - __m256i const_129 = __lasx_xvldi(129); - __m256i const_br = {0x4219421942194219, 0x4219421942194219, - 0x4219421942194219, 0x4219421942194219}; - __m256i const_1080 = {0x1080108010801080, 0x1080108010801080, - 0x1080108010801080, 0x1080108010801080}; - __m256i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C, 0x0B09080605030200, - 0x17151412110F0E0C}; - __m256i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604, 
0x0301001E1D1B1A18, - 0x0F0D0C0A09070604}; - __m256i shuff2 = {0x000A000700040001, 0x001600130010000D, 0x000A000700040001, - 0x001600130010000D}; - __m256i shuff3 = {0x0002001F001C0019, 0x000E000B00080005, 0x0002001F001C0019, - 0x000E000B00080005}; - - for (x = 0; x < len; x++) { - reg0 = __lasx_xvld(src_rgb24, 0); - reg1 = __lasx_xvld(src_rgb24, 32); - reg2 = __lasx_xvld(src_rgb24, 64); - src0 = __lasx_xvpermi_q(reg1, reg0, 0x30); - src1 = __lasx_xvpermi_q(reg2, reg0, 0x21); - src2 = __lasx_xvpermi_q(reg2, reg1, 0x30); - tmp0 = __lasx_xvshuf_b(src1, src0, shuff0); - tmp1 = __lasx_xvshuf_b(src1, src2, shuff1); - tmp2 = __lasx_xvshuf_b(src1, src0, shuff2); - tmp3 = __lasx_xvshuf_b(src1, src2, shuff3); - reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmp2, const_129); - reg1 = __lasx_xvmaddwev_h_bu(const_1080, tmp3, const_129); - reg0 = __lasx_xvdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lasx_xvdp2add_h_bu(reg1, const_br, tmp1); - dst0 = __lasx_xvpickod_b(reg1, reg0); - __lasx_xvst(dst0, dst_y, 0); - dst_y += 32; - src_rgb24 += 96; - } -} - void RGB24ToUVRow_LASX(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, @@ -1916,48 +1840,6 @@ void RGB24ToUVRow_LASX(const uint8_t* src_rgb24, } } -void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width) { - int x; - int len = width / 32; - __m256i src0, src1, src2; - __m256i tmp0, tmp1, tmp2, tmp3; - __m256i reg0, reg1, reg2, dst0; - __m256i const_129 = __lasx_xvldi(129); - __m256i const_br = {0x1942194219421942, 0x1942194219421942, - 0x1942194219421942, 0x1942194219421942}; - __m256i const_1080 = {0x1080108010801080, 0x1080108010801080, - 0x1080108010801080, 0x1080108010801080}; - __m256i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C, 0x0B09080605030200, - 0x17151412110F0E0C}; - __m256i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604, 0x0301001E1D1B1A18, - 0x0F0D0C0A09070604}; - __m256i shuff2 = {0x000A000700040001, 0x001600130010000D, 0x000A000700040001, - 0x001600130010000D}; - __m256i shuff3 = 
{0x0002001F001C0019, 0x000E000B00080005, 0x0002001F001C0019, - 0x000E000B00080005}; - - for (x = 0; x < len; x++) { - reg0 = __lasx_xvld(src_raw, 0); - reg1 = __lasx_xvld(src_raw, 32); - reg2 = __lasx_xvld(src_raw, 64); - src0 = __lasx_xvpermi_q(reg1, reg0, 0x30); - src1 = __lasx_xvpermi_q(reg2, reg0, 0x21); - src2 = __lasx_xvpermi_q(reg2, reg1, 0x30); - tmp0 = __lasx_xvshuf_b(src1, src0, shuff0); - tmp1 = __lasx_xvshuf_b(src1, src2, shuff1); - tmp2 = __lasx_xvshuf_b(src1, src0, shuff2); - tmp3 = __lasx_xvshuf_b(src1, src2, shuff3); - reg0 = __lasx_xvmaddwev_h_bu(const_1080, tmp2, const_129); - reg1 = __lasx_xvmaddwev_h_bu(const_1080, tmp3, const_129); - reg0 = __lasx_xvdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lasx_xvdp2add_h_bu(reg1, const_br, tmp1); - dst0 = __lasx_xvpickod_b(reg1, reg0); - __lasx_xvst(dst0, dst_y, 0); - dst_y += 32; - src_raw += 96; - } -} - void RAWToUVRow_LASX(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, @@ -2118,36 +2000,228 @@ void NV21ToARGBRow_LASX(const uint8_t* src_y, } } -void ARGBToYJRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - int len = width / 32; - __m256i src0, src1, src2, src3, dst0; - __m256i tmp0, tmp1, tmp2, tmp3; - __m256i reg0, reg1; - __m256i const_128 = __lasx_xvldi(0x480); - __m256i const_150 = __lasx_xvldi(0x96); - __m256i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D, - 0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D}; - __m256i shuff = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002, - 0x0000000700000003}; +struct RgbConstants { + uint8_t kRGBToY[4]; + uint16_t kAddY; + uint16_t pad; +}; + +// RGB to JPeg coefficients +// B * 0.1140 coefficient = 29 +// G * 0.5870 coefficient = 150 +// R * 0.2990 coefficient = 77 +// Add 0.5 = 0x80 +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, + 128, + 0}; + +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; + +// RGB to BT.601 coefficients +// B * 0.1016 coefficient = 25 
+// G * 0.5078 coefficient = 129 +// R * 0.2578 coefficient = 66 +// Add 16.5 = 0x1080 + +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + 0x1080, + 0}; + +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, + 0x1080, + 0}; + +// ARGB expects first 3 values to contain RGB and 4th value is ignored. +static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; + asm volatile( + "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants + "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants + "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants + "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants + "xvld $xr20, %4, 0 \n\t" // load shuff + "1: \n\t" + "xvld $xr4, %0, 0 \n\t" + "xvld $xr5, %0, 32 \n\t" + "xvld $xr6, %0, 64 \n\t" + "xvld $xr7, %0, 96 \n\t" // load 32 pixels of + // ARGB + "xvor.v $xr12, $xr3, $xr3 \n\t" + "xvor.v $xr13, $xr3, $xr3 \n\t" + "addi.d %2, %2, -32 \n\t" // 32 processed per + // loop. 
+ "xvpickev.b $xr8, $xr5, $xr4 \n\t" // BR + "xvpickev.b $xr10, $xr7, $xr6 \n\t" + "xvpickod.b $xr9, $xr5, $xr4 \n\t" // GA + "xvpickod.b $xr11, $xr7, $xr6 \n\t" + "xvmaddwev.h.bu $xr12, $xr8, $xr0 \n\t" // B + "xvmaddwev.h.bu $xr13, $xr10, $xr0 \n\t" + "xvmaddwev.h.bu $xr12, $xr9, $xr1 \n\t" // G + "xvmaddwev.h.bu $xr13, $xr11, $xr1 \n\t" + "xvmaddwod.h.bu $xr12, $xr8, $xr2 \n\t" // R + "xvmaddwod.h.bu $xr13, $xr10, $xr2 \n\t" + "addi.d %0, %0, 128 \n\t" + "xvpickod.b $xr10, $xr13, $xr12 \n\t" + "xvperm.w $xr11, $xr10, $xr20 \n\t" + "xvst $xr11, %1, 0 \n\t" + "addi.d %1, %1, 32 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_argb), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants), "r"(shuff) + : "memory"); +} - for (x = 0; x < len; x++) { - DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb, - 96, src0, src1, src2, src3); - tmp0 = __lasx_xvpickev_b(src1, src0); - tmp1 = __lasx_xvpickod_b(src1, src0); - tmp2 = __lasx_xvpickev_b(src3, src2); - tmp3 = __lasx_xvpickod_b(src3, src2); - reg0 = __lasx_xvmaddwev_h_bu(const_128, tmp1, const_150); - reg1 = __lasx_xvmaddwev_h_bu(const_128, tmp3, const_150); - reg0 = __lasx_xvdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lasx_xvdp2add_h_bu(reg1, const_br, tmp2); - dst0 = __lasx_xvpickod_b(reg1, reg0); - dst0 = __lasx_xvperm_w(dst0, shuff); - __lasx_xvst(dst0, dst_y, 0); - dst_y += 32; - src_argb += 128; - } +void ARGBToYRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_LASX(src_argb, dst_y, width, &kRgb24I601Constants); +} + +void ARGBToYJRow_LASX(const uint8_t* src_argb, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_LASX(src_argb, dst_yj, width, &kRgb24JPEGConstants); +} + +void ABGRToYRow_LASX(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_LASX(src_abgr, dst_y, width, &kRawI601Constants); +} + +void ABGRToYJRow_LASX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_LASX(src_abgr, dst_yj, width, 
&kRawJPEGConstants); +} + +// RGBA expects first value to be A and ignored, then 3 values to contain RGB. +// Same code as ARGB, except the LD4 +static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; + asm volatile( + "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants + "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants + "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants + "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants + "xvld $xr20, %4, 0 \n\t" // load shuff + "1: \n\t" + "xvld $xr4, %0, 0 \n\t" + "xvld $xr5, %0, 32 \n\t" + "xvld $xr6, %0, 64 \n\t" + "xvld $xr7, %0, 96 \n\t" // load 32 pixels of + // RGBA + "xvor.v $xr12, $xr3, $xr3 \n\t" + "xvor.v $xr13, $xr3, $xr3 \n\t" + "addi.d %2, %2, -32 \n\t" // 32 processed per + // loop. + "xvpickev.b $xr8, $xr5, $xr4 \n\t" // AG + "xvpickev.b $xr10, $xr7, $xr6 \n\t" + "xvpickod.b $xr9, $xr5, $xr4 \n\t" // BR + "xvpickod.b $xr11, $xr7, $xr6 \n\t" + "xvmaddwev.h.bu $xr12, $xr9, $xr0 \n\t" // B + "xvmaddwev.h.bu $xr13, $xr11, $xr0 \n\t" + "xvmaddwod.h.bu $xr12, $xr8, $xr1 \n\t" // G + "xvmaddwod.h.bu $xr13, $xr10, $xr1 \n\t" + "xvmaddwod.h.bu $xr12, $xr9, $xr2 \n\t" // R + "xvmaddwod.h.bu $xr13, $xr11, $xr2 \n\t" + "addi.d %0, %0, 128 \n\t" + "xvpickod.b $xr10, $xr13, $xr12 \n\t" + "xvperm.w $xr11, $xr10, $xr20 \n\t" + "xvst $xr11, %1, 0 \n\t" + "addi.d %1, %1, 32 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_rgba), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants), "r"(shuff) + : "memory"); +} + +void RGBAToYRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_LASX(src_rgba, dst_y, width, &kRgb24I601Constants); +} + +void RGBAToYJRow_LASX(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { + RGBAToYMatrixRow_LASX(src_rgba, dst_yj, width, &kRgb24JPEGConstants); +} + +void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + 
RGBAToYMatrixRow_LASX(src_bgra, dst_y, width, &kRawI601Constants); +} + +static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + int8_t shuff[128] = { + 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, + 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23, + 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, + 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15, + 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, + 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, + 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0, + 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; + asm volatile( + "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants + "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants + "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants + "xvldrepl.h $xr3, %3, 4 \n\t" // load rgbconstants + "xvld $xr4, %4, 0 \n\t" // load shuff + "xvld $xr5, %4, 32 \n\t" + "xvld $xr6, %4, 64 \n\t" + "xvld $xr7, %4, 96 \n\t" + "1: \n\t" + "xvld $xr8, %0, 0 \n\t" + "xvld $xr9, %0, 32 \n\t" + "xvld $xr10, %0, 64 \n\t" // load 32 pixels of + // RGB + "xvor.v $xr12, $xr3, $xr3 \n\t" + "xvor.v $xr13, $xr3, $xr3 \n\t" + "xvor.v $xr11, $xr9, $xr9 \n\t" + "addi.d %2, %2, -32 \n\t" // 32 processed per + // loop. 
+ "xvpermi.q $xr9, $xr8, 0x30 \n\t" // src0 + "xvpermi.q $xr8, $xr10, 0x03 \n\t" // src1 + "xvpermi.q $xr10, $xr11, 0x30 \n\t" // src2 + "xvshuf.b $xr14, $xr8, $xr9, $xr4 \n\t" + "xvshuf.b $xr15, $xr8, $xr10, $xr5 \n\t" + "xvshuf.b $xr16, $xr8, $xr9, $xr6 \n\t" + "xvshuf.b $xr17, $xr8, $xr10, $xr7 \n\t" + "xvmaddwev.h.bu $xr12, $xr16, $xr1 \n\t" // G + "xvmaddwev.h.bu $xr13, $xr17, $xr1 \n\t" + "xvmaddwev.h.bu $xr12, $xr14, $xr0 \n\t" // B + "xvmaddwev.h.bu $xr13, $xr15, $xr0 \n\t" + "xvmaddwod.h.bu $xr12, $xr14, $xr2 \n\t" // R + "xvmaddwod.h.bu $xr13, $xr15, $xr2 \n\t" + "addi.d %0, %0, 96 \n\t" + "xvpickod.b $xr10, $xr13, $xr12 \n\t" + "xvst $xr10, %1, 0 \n\t" + "addi.d %1, %1, 32 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_rgba), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants), // %3 + "r"(shuff) // %4 + : "memory"); +} + +void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LASX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} + +void RAWToYJRow_LASX(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LASX(src_raw, dst_yj, width, &kRawJPEGConstants); +} + +void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_LASX(src_rgb24, dst_y, width, &kRgb24I601Constants); +} + +void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_LASX(src_raw, dst_y, width, &kRawI601Constants); } void ARGBToUVJRow_LASX(const uint8_t* src_argb, diff --git a/files/source/row_lsx.cc b/files/source/row_lsx.cc index 3e8b901a..e626072a 100644 --- a/files/source/row_lsx.cc +++ b/files/source/row_lsx.cc @@ -31,6 +31,91 @@ extern "C" { yb = __lsx_vreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \ } +// Load 32 YUV422 pixel data +#define READYUV422_D(psrc_y, psrc_u, psrc_v, out_y, uv_l, uv_h) \ + { \ + __m128i temp0, temp1; \ + \ + DUP2_ARG2(__lsx_vld, psrc_y, 0, psrc_u, 0, out_y, temp0); \ + temp1 = __lsx_vld(psrc_v, 0); \ + temp0 = 
__lsx_vsub_b(temp0, const_80); \ + temp1 = __lsx_vsub_b(temp1, const_80); \ + temp0 = __lsx_vsllwil_h_b(temp0, 0); \ + temp1 = __lsx_vsllwil_h_b(temp1, 0); \ + uv_l = __lsx_vilvl_h(temp0, temp1); \ + uv_h = __lsx_vilvh_h(temp0, temp1); \ + } + +// Load 16 YUV422 pixel data +#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, uv) \ + { \ + __m128i temp0, temp1; \ + \ + out_y = __lsx_vld(psrc_y, 0); \ + temp0 = __lsx_vldrepl_d(psrc_u, 0); \ + temp1 = __lsx_vldrepl_d(psrc_v, 0); \ + uv = __lsx_vilvl_b(temp0, temp1); \ + uv = __lsx_vsub_b(uv, const_80); \ + uv = __lsx_vsllwil_h_b(uv, 0); \ + } + +// Convert 16 pixels of YUV420 to RGB. +#define YUVTORGB_D(in_y, in_uvl, in_uvh, ubvr, ugvg, yg, yb, b_l, b_h, g_l, \ + g_h, r_l, r_h) \ + { \ + __m128i u_l, u_h, v_l, v_h; \ + __m128i yl_ev, yl_od, yh_ev, yh_od; \ + __m128i temp0, temp1, temp2, temp3; \ + \ + temp0 = __lsx_vilvl_b(in_y, in_y); \ + temp1 = __lsx_vilvh_b(in_y, in_y); \ + yl_ev = __lsx_vmulwev_w_hu_h(temp0, yg); \ + yl_od = __lsx_vmulwod_w_hu_h(temp0, yg); \ + yh_ev = __lsx_vmulwev_w_hu_h(temp1, yg); \ + yh_od = __lsx_vmulwod_w_hu_h(temp1, yg); \ + DUP4_ARG2(__lsx_vsrai_w, yl_ev, 16, yl_od, 16, yh_ev, 16, yh_od, 16, \ + yl_ev, yl_od, yh_ev, yh_od); \ + yl_ev = __lsx_vadd_w(yl_ev, yb); \ + yl_od = __lsx_vadd_w(yl_od, yb); \ + yh_ev = __lsx_vadd_w(yh_ev, yb); \ + yh_od = __lsx_vadd_w(yh_od, yb); \ + v_l = __lsx_vmulwev_w_h(in_uvl, ubvr); \ + u_l = __lsx_vmulwod_w_h(in_uvl, ubvr); \ + v_h = __lsx_vmulwev_w_h(in_uvh, ubvr); \ + u_h = __lsx_vmulwod_w_h(in_uvh, ubvr); \ + temp0 = __lsx_vadd_w(yl_ev, u_l); \ + temp1 = __lsx_vadd_w(yl_od, u_l); \ + temp2 = __lsx_vadd_w(yh_ev, u_h); \ + temp3 = __lsx_vadd_w(yh_od, u_h); \ + DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ + temp1, temp2, temp3); \ + DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ + temp2, temp3); \ + b_l = __lsx_vpackev_h(temp1, temp0); \ + b_h = __lsx_vpackev_h(temp3, temp2); \ + temp0 = __lsx_vadd_w(yl_ev, 
v_l); \ + temp1 = __lsx_vadd_w(yl_od, v_l); \ + temp2 = __lsx_vadd_w(yh_ev, v_h); \ + temp3 = __lsx_vadd_w(yh_od, v_h); \ + DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ + temp1, temp2, temp3); \ + DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ + temp2, temp3); \ + r_l = __lsx_vpackev_h(temp1, temp0); \ + r_h = __lsx_vpackev_h(temp3, temp2); \ + DUP2_ARG2(__lsx_vdp2_w_h, in_uvl, ugvg, in_uvh, ugvg, u_l, u_h); \ + temp0 = __lsx_vsub_w(yl_ev, u_l); \ + temp1 = __lsx_vsub_w(yl_od, u_l); \ + temp2 = __lsx_vsub_w(yh_ev, u_h); \ + temp3 = __lsx_vsub_w(yh_od, u_h); \ + DUP4_ARG2(__lsx_vsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, temp0, \ + temp1, temp2, temp3); \ + DUP4_ARG1(__lsx_vclip255_w, temp0, temp1, temp2, temp3, temp0, temp1, \ + temp2, temp3); \ + g_l = __lsx_vpackev_h(temp1, temp0); \ + g_h = __lsx_vpackev_h(temp3, temp2); \ + } + // Convert 8 pixels of YUV420 to RGB. #define YUVTORGB(in_y, in_vu, vrub, vgug, yg, yb, out_b, out_g, out_r) \ { \ @@ -118,42 +203,1083 @@ extern "C" { out_g = __lsx_vpackev_h(tmp1, tmp0); \ } -// Pack and Store 8 ARGB values. -#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \ - { \ - __m128i temp0, temp1; \ - __m128i dst0, dst1; \ - \ - temp0 = __lsx_vpackev_b(in_g, in_b); \ - temp1 = __lsx_vpackev_b(in_a, in_r); \ - dst0 = __lsx_vilvl_h(temp1, temp0); \ - dst1 = __lsx_vilvh_h(temp1, temp0); \ - __lsx_vst(dst0, pdst_argb, 0); \ - __lsx_vst(dst1, pdst_argb, 16); \ - pdst_argb += 32; \ +// Pack and Store 16 ARGB values. 
+#define STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, pdst_argb) \ + { \ + __m128i temp0, temp1, temp2, temp3; \ + temp0 = __lsx_vpackev_b(g_l, b_l); \ + temp1 = __lsx_vpackev_b(a_l, r_l); \ + temp2 = __lsx_vpackev_b(g_h, b_h); \ + temp3 = __lsx_vpackev_b(a_h, r_h); \ + r_l = __lsx_vilvl_h(temp1, temp0); \ + r_h = __lsx_vilvh_h(temp1, temp0); \ + g_l = __lsx_vilvl_h(temp3, temp2); \ + g_h = __lsx_vilvh_h(temp3, temp2); \ + __lsx_vst(r_l, pdst_argb, 0); \ + __lsx_vst(r_h, pdst_argb, 16); \ + __lsx_vst(g_l, pdst_argb, 32); \ + __lsx_vst(g_h, pdst_argb, 48); \ + pdst_argb += 64; \ + } + +// Pack and Store 8 ARGB values. +#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \ + { \ + __m128i temp0, temp1; \ + __m128i dst0, dst1; \ + \ + temp0 = __lsx_vpackev_b(in_g, in_b); \ + temp1 = __lsx_vpackev_b(in_a, in_r); \ + dst0 = __lsx_vilvl_h(temp1, temp0); \ + dst1 = __lsx_vilvh_h(temp1, temp0); \ + __lsx_vst(dst0, pdst_argb, 0); \ + __lsx_vst(dst1, pdst_argb, 16); \ + pdst_argb += 32; \ + } + +#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \ + { \ + __m128i _tmp0, _tmp1, _tmp2, _tmp3; \ + __m128i _reg0, _reg1; \ + _tmp0 = __lsx_vaddwev_h_bu(_tmpb, _nexb); \ + _tmp1 = __lsx_vaddwod_h_bu(_tmpb, _nexb); \ + _tmp2 = __lsx_vaddwev_h_bu(_tmpg, _nexg); \ + _tmp3 = __lsx_vaddwod_h_bu(_tmpg, _nexg); \ + _reg0 = __lsx_vaddwev_h_bu(_tmpr, _nexr); \ + _reg1 = __lsx_vaddwod_h_bu(_tmpr, _nexr); \ + _tmpb = __lsx_vavgr_hu(_tmp0, _tmp1); \ + _tmpg = __lsx_vavgr_hu(_tmp2, _tmp3); \ + _tmpr = __lsx_vavgr_hu(_reg0, _reg1); \ + _reg0 = __lsx_vmadd_h(const_8080, const_112, _tmpb); \ + _reg1 = __lsx_vmadd_h(const_8080, const_112, _tmpr); \ + _reg0 = __lsx_vmsub_h(_reg0, const_74, _tmpg); \ + _reg1 = __lsx_vmsub_h(_reg1, const_94, _tmpg); \ + _reg0 = __lsx_vmsub_h(_reg0, const_38, _tmpr); \ + _reg1 = __lsx_vmsub_h(_reg1, const_18, _tmpb); \ + _dst0 = __lsx_vpickod_b(_reg1, _reg0); \ + } + +void MirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width) { + int x; + int 
len = width / 32; + __m128i src0, src1; + __m128i shuffler = {0x08090A0B0C0D0E0F, 0x0001020304050607}; + src += width - 32; + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0, + src1); + __lsx_vst(src1, dst, 0); + __lsx_vst(src0, dst, 16); + dst += 32; + src -= 32; + } +} + +void MirrorUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + int x; + int len = width / 8; + __m128i src, dst; + __m128i shuffler = {0x0004000500060007, 0x0000000100020003}; + + src_uv += (width - 8) << 1; + for (x = 0; x < len; x++) { + src = __lsx_vld(src_uv, 0); + dst = __lsx_vshuf_h(shuffler, src, src); + __lsx_vst(dst, dst_uv, 0); + src_uv -= 16; + dst_uv += 16; + } +} + +void ARGBMirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width) { + int x; + int len = width / 8; + __m128i src0, src1; + __m128i shuffler = {0x0B0A09080F0E0D0C, 0x0302010007060504}; + + src += (width * 4) - 32; + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, shuffler, src1, src1, shuffler, src0, + src1); + __lsx_vst(src1, dst, 0); + __lsx_vst(src0, dst, 16); + dst += 32; + src -= 32; + } +} + +void I422ToYUY2Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + int x; + int len = width / 16; + __m128i src_u0, src_v0, src_y0, vec_uv0; + __m128i vec_yuy2_0, vec_yuy2_1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src_u0, src_v0); + src_y0 = __lsx_vld(src_y, 0); + vec_uv0 = __lsx_vilvl_b(src_v0, src_u0); + vec_yuy2_0 = __lsx_vilvl_b(vec_uv0, src_y0); + vec_yuy2_1 = __lsx_vilvh_b(vec_uv0, src_y0); + __lsx_vst(vec_yuy2_0, dst_yuy2, 0); + __lsx_vst(vec_yuy2_1, dst_yuy2, 16); + src_u += 8; + src_v += 8; + src_y += 16; + dst_yuy2 += 32; + } +} + +void I422ToUYVYRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + 
uint8_t* dst_uyvy, + int width) { + int x; + int len = width / 16; + __m128i src_u0, src_v0, src_y0, vec_uv0; + __m128i vec_uyvy0, vec_uyvy1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_u, 0, src_v, 0, src_u0, src_v0); + src_y0 = __lsx_vld(src_y, 0); + vec_uv0 = __lsx_vilvl_b(src_v0, src_u0); + vec_uyvy0 = __lsx_vilvl_b(src_y0, vec_uv0); + vec_uyvy1 = __lsx_vilvh_b(src_y0, vec_uv0); + __lsx_vst(vec_uyvy0, dst_uyvy, 0); + __lsx_vst(vec_uyvy1, dst_uyvy, 16); + src_u += 8; + src_v += 8; + src_y += 16; + dst_uyvy += 32; + } +} + +void I422ToARGBRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_ug, vec_vr, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i alpha = __lsx_vldi(0xFF); + __m128i const_80 = __lsx_vldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + STOREARGB_D(alpha, alpha, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb); + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +void I422ToRGBARow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i alpha = __lsx_vldi(0xFF); + __m128i const_80 = __lsx_vldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; 
x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + STOREARGB_D(r_l, r_h, g_l, g_h, b_l, b_h, alpha, alpha, dst_argb); + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +void I422AlphaToARGBRow_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + int res = width & 15; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i zero = __lsx_vldi(0); + __m128i const_80 = __lsx_vldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h, a_l, a_h; + + y = __lsx_vld(src_a, 0); + a_l = __lsx_vilvl_b(zero, y); + a_h = __lsx_vilvh_b(zero, y); + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb); + src_y += 16; + src_u += 8; + src_v += 8; + src_a += 16; + } + if (res) { + __m128i y, uv, r, g, b, a; + a = __lsx_vld(src_a, 0); + a = __lsx_vsllwil_hu_bu(a, 0); + READYUV422(src_y, src_u, src_v, y, uv); + YUVTORGB(y, uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b, g, r); + STOREARGB(a, r, g, b, dst_argb); + } +} + +void I422ToRGB24Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int32_t width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i const_80 = __lsx_vldi(0x80); + __m128i 
shuffler0 = {0x0504120302100100, 0x0A18090816070614}; + __m128i shuffler1 = {0x1E0F0E1C0D0C1A0B, 0x1E0F0E1C0D0C1A0B}; + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + __m128i temp0, temp1, temp2, temp3; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + temp0 = __lsx_vpackev_b(g_l, b_l); + temp1 = __lsx_vpackev_b(g_h, b_h); + DUP4_ARG3(__lsx_vshuf_b, r_l, temp0, shuffler1, r_h, temp1, shuffler1, r_l, + temp0, shuffler0, r_h, temp1, shuffler0, temp2, temp3, temp0, + temp1); + + b_l = __lsx_vilvl_d(temp1, temp2); + b_h = __lsx_vilvh_d(temp3, temp1); + __lsx_vst(temp0, dst_argb, 0); + __lsx_vst(b_l, dst_argb, 16); + __lsx_vst(b_h, dst_argb, 32); + dst_argb += 48; + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. 
+void I422ToRGB565Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i const_80 = __lsx_vldi(0x80); + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + b_l = __lsx_vsrli_h(b_l, 3); + b_h = __lsx_vsrli_h(b_h, 3); + g_l = __lsx_vsrli_h(g_l, 2); + g_h = __lsx_vsrli_h(g_h, 2); + r_l = __lsx_vsrli_h(r_l, 3); + r_h = __lsx_vsrli_h(r_h, 3); + r_l = __lsx_vslli_h(r_l, 11); + r_h = __lsx_vslli_h(r_h, 11); + g_l = __lsx_vslli_h(g_l, 5); + g_h = __lsx_vslli_h(g_h, 5); + r_l = __lsx_vor_v(r_l, g_l); + r_l = __lsx_vor_v(r_l, b_l); + r_h = __lsx_vor_v(r_h, g_h); + r_h = __lsx_vor_v(r_h, b_h); + __lsx_vst(r_l, dst_rgb565, 0); + __lsx_vst(r_h, dst_rgb565, 16); + dst_rgb565 += 32; + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. 
+void I422ToARGB4444Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i const_80 = __lsx_vldi(0x80); + __m128i alpha = {0xF000F000F000F000, 0xF000F000F000F000}; + __m128i mask = {0x00F000F000F000F0, 0x00F000F000F000F0}; + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = __lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + b_l = __lsx_vsrli_h(b_l, 4); + b_h = __lsx_vsrli_h(b_h, 4); + r_l = __lsx_vsrli_h(r_l, 4); + r_h = __lsx_vsrli_h(r_h, 4); + g_l = __lsx_vand_v(g_l, mask); + g_h = __lsx_vand_v(g_h, mask); + r_l = __lsx_vslli_h(r_l, 8); + r_h = __lsx_vslli_h(r_h, 8); + r_l = __lsx_vor_v(r_l, alpha); + r_h = __lsx_vor_v(r_h, alpha); + r_l = __lsx_vor_v(r_l, g_l); + r_h = __lsx_vor_v(r_h, g_h); + r_l = __lsx_vor_v(r_l, b_l); + r_h = __lsx_vor_v(r_h, b_h); + __lsx_vst(r_l, dst_argb4444, 0); + __lsx_vst(r_h, dst_argb4444, 16); + dst_argb4444 += 32; + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +void I422ToARGB1555Row_LSX(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int len = width / 16; + __m128i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg; + __m128i vec_ubvr, vec_ugvg; + __m128i const_80 = __lsx_vldi(0x80); + __m128i alpha = {0x8000800080008000, 0x8000800080008000}; + + YUVTORGB_SETUP(yuvconstants, vec_vr, vec_ub, vec_vg, vec_ug, vec_yg, vec_yb); + vec_ubvr = __lsx_vilvl_h(vec_ub, vec_vr); + vec_ugvg = 
__lsx_vilvl_h(vec_ug, vec_vg); + + for (x = 0; x < len; x++) { + __m128i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h; + + READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h); + YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b_l, b_h, g_l, + g_h, r_l, r_h); + b_l = __lsx_vsrli_h(b_l, 3); + b_h = __lsx_vsrli_h(b_h, 3); + g_l = __lsx_vsrli_h(g_l, 3); + + g_h = __lsx_vsrli_h(g_h, 3); + g_l = __lsx_vslli_h(g_l, 5); + g_h = __lsx_vslli_h(g_h, 5); + r_l = __lsx_vsrli_h(r_l, 3); + r_h = __lsx_vsrli_h(r_h, 3); + r_l = __lsx_vslli_h(r_l, 10); + r_h = __lsx_vslli_h(r_h, 10); + r_l = __lsx_vor_v(r_l, alpha); + r_h = __lsx_vor_v(r_h, alpha); + r_l = __lsx_vor_v(r_l, g_l); + r_h = __lsx_vor_v(r_h, g_h); + r_l = __lsx_vor_v(r_l, b_l); + r_h = __lsx_vor_v(r_h, b_h); + __lsx_vst(r_l, dst_argb1555, 0); + __lsx_vst(r_h, dst_argb1555, 16); + dst_argb1555 += 32; + src_y += 16; + src_u += 8; + src_v += 8; + } +} + +void YUY2ToYRow_LSX(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + int x; + int len = width / 16; + __m128i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src0, src1); + dst0 = __lsx_vpickev_b(src1, src0); + __lsx_vst(dst0, dst_y, 0); + src_yuy2 += 32; + dst_y += 16; + } +} + +void YUY2ToUVRow_LSX(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2; + int x; + int len = width / 16; + __m128i src0, src1, src2, src3; + __m128i tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src_yuy2_next, 0, + src_yuy2_next, 16, src0, src1, src2, src3); + src0 = __lsx_vpickod_b(src1, src0); + src1 = __lsx_vpickod_b(src3, src2); + tmp0 = __lsx_vavgr_bu(src1, src0); + dst0 = __lsx_vpickev_b(tmp0, tmp0); + dst1 = __lsx_vpickod_b(tmp0, tmp0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst1, dst_v, 0, 0); + src_yuy2 += 32; + src_yuy2_next += 32; + dst_u += 8; 
+ dst_v += 8; + } +} + +void YUY2ToUV422Row_LSX(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 16; + __m128i src0, src1, tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_yuy2, 0, src_yuy2, 16, src0, src1); + tmp0 = __lsx_vpickod_b(src1, src0); + dst0 = __lsx_vpickev_b(tmp0, tmp0); + dst1 = __lsx_vpickod_b(tmp0, tmp0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst1, dst_v, 0, 0); + src_yuy2 += 32; + dst_u += 8; + dst_v += 8; + } +} + +void UYVYToYRow_LSX(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + int x; + int len = width / 16; + __m128i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src0, src1); + dst0 = __lsx_vpickod_b(src1, src0); + __lsx_vst(dst0, dst_y, 0); + src_uyvy += 32; + dst_y += 16; + } +} + +void UYVYToUVRow_LSX(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy; + int x; + int len = width / 16; + __m128i src0, src1, src2, src3, tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src_uyvy_next, 0, + src_uyvy_next, 16, src0, src1, src2, src3); + src0 = __lsx_vpickev_b(src1, src0); + src1 = __lsx_vpickev_b(src3, src2); + tmp0 = __lsx_vavgr_bu(src1, src0); + dst0 = __lsx_vpickev_b(tmp0, tmp0); + dst1 = __lsx_vpickod_b(tmp0, tmp0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst1, dst_v, 0, 0); + src_uyvy += 32; + src_uyvy_next += 32; + dst_u += 8; + dst_v += 8; + } +} + +void UYVYToUV422Row_LSX(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 16; + __m128i src0, src1, tmp0, dst0, dst1; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_uyvy, 0, src_uyvy, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + dst0 = __lsx_vpickev_b(tmp0, tmp0); + dst1 = 
__lsx_vpickod_b(tmp0, tmp0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst1, dst_v, 0, 0); + src_uyvy += 32; + dst_u += 8; + dst_v += 8; + } +} + +void ARGBToUVRow_LSX(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + int len = width / 16; + const uint8_t* src_argb1 = src_argb0 + src_stride_argb; + + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i vec0, vec1, vec2, vec3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, dst0, dst1; + __m128i const_0x70 = {0x0038003800380038, 0x0038003800380038}; + __m128i const_0x4A = {0x0025002500250025, 0x0025002500250025}; + __m128i const_0x26 = {0x0013001300130013, 0x0013001300130013}; + __m128i const_0x5E = {0x002f002f002f002f, 0x002f002f002f002f}; + __m128i const_0x12 = {0x0009000900090009, 0x0009000900090009}; + __m128i const_0x8080 = {0x8080808080808080, 0x8080808080808080}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb0, 0, src_argb0, 16, src_argb0, 32, src_argb0, + 48, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vld, src_argb1, 0, src_argb1, 16, src_argb1, 32, src_argb1, + 48, src4, src5, src6, src7); + vec0 = __lsx_vaddwev_h_bu(src0, src4); + vec1 = __lsx_vaddwev_h_bu(src1, src5); + vec2 = __lsx_vaddwev_h_bu(src2, src6); + vec3 = __lsx_vaddwev_h_bu(src3, src7); + tmp0 = __lsx_vpickev_h(vec1, vec0); + tmp1 = __lsx_vpickev_h(vec3, vec2); + tmp2 = __lsx_vpickod_h(vec1, vec0); + tmp3 = __lsx_vpickod_h(vec3, vec2); + vec0 = __lsx_vaddwod_h_bu(src0, src4); + vec1 = __lsx_vaddwod_h_bu(src1, src5); + vec2 = __lsx_vaddwod_h_bu(src2, src6); + vec3 = __lsx_vaddwod_h_bu(src3, src7); + tmp4 = __lsx_vpickev_h(vec1, vec0); + tmp5 = __lsx_vpickev_h(vec3, vec2); + vec0 = __lsx_vpickev_h(tmp1, tmp0); + vec1 = __lsx_vpickod_h(tmp1, tmp0); + src0 = __lsx_vavgr_h(vec0, vec1); + vec0 = __lsx_vpickev_h(tmp3, tmp2); + vec1 = __lsx_vpickod_h(tmp3, tmp2); + src1 = __lsx_vavgr_h(vec0, vec1); + vec0 = __lsx_vpickev_h(tmp5, tmp4); + vec1 = 
__lsx_vpickod_h(tmp5, tmp4); + src2 = __lsx_vavgr_h(vec0, vec1); + dst0 = __lsx_vmadd_h(const_0x8080, src0, const_0x70); + dst0 = __lsx_vmsub_h(dst0, src2, const_0x4A); + dst0 = __lsx_vmsub_h(dst0, src1, const_0x26); + dst1 = __lsx_vmadd_h(const_0x8080, src1, const_0x70); + dst1 = __lsx_vmsub_h(dst1, src2, const_0x5E); + dst1 = __lsx_vmsub_h(dst1, src0, const_0x12); + dst0 = __lsx_vsrai_h(dst0, 8); + dst1 = __lsx_vsrai_h(dst1, 8); + dst0 = __lsx_vpickev_b(dst1, dst0); + __lsx_vstelm_d(dst0, dst_u, 0, 0); + __lsx_vstelm_d(dst0, dst_v, 0, 1); + src_argb0 += 64; + src_argb1 += 64; + dst_u += 8; + dst_v += 8; + } +} + +void ARGBToRGB24Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + int len = (width / 16) - 1; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i shuf = {0x0908060504020100, 0x000000000E0D0C0A}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vshuf_b(src0, src0, shuf); + tmp1 = __lsx_vshuf_b(src1, src1, shuf); + tmp2 = __lsx_vshuf_b(src2, src2, shuf); + tmp3 = __lsx_vshuf_b(src3, src3, shuf); + __lsx_vst(tmp0, dst_rgb, 0); + __lsx_vst(tmp1, dst_rgb, 12); + __lsx_vst(tmp2, dst_rgb, 24); + __lsx_vst(tmp3, dst_rgb, 36); + dst_rgb += 48; + src_argb += 64; + } + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vshuf_b(src0, src0, shuf); + tmp1 = __lsx_vshuf_b(src1, src1, shuf); + tmp2 = __lsx_vshuf_b(src2, src2, shuf); + tmp3 = __lsx_vshuf_b(src3, src3, shuf); + __lsx_vst(tmp0, dst_rgb, 0); + __lsx_vst(tmp1, dst_rgb, 12); + __lsx_vst(tmp2, dst_rgb, 24); + dst_rgb += 36; + __lsx_vst(tmp3, dst_rgb, 0); +} + +void ARGBToRAWRow_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + int len = (width / 16) - 1; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i shuf = {0x090A040506000102, 
0x000000000C0D0E08}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vshuf_b(src0, src0, shuf); + tmp1 = __lsx_vshuf_b(src1, src1, shuf); + tmp2 = __lsx_vshuf_b(src2, src2, shuf); + tmp3 = __lsx_vshuf_b(src3, src3, shuf); + __lsx_vst(tmp0, dst_rgb, 0); + __lsx_vst(tmp1, dst_rgb, 12); + __lsx_vst(tmp2, dst_rgb, 24); + __lsx_vst(tmp3, dst_rgb, 36); + dst_rgb += 48; + src_argb += 64; + } + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vshuf_b(src0, src0, shuf); + tmp1 = __lsx_vshuf_b(src1, src1, shuf); + tmp2 = __lsx_vshuf_b(src2, src2, shuf); + tmp3 = __lsx_vshuf_b(src3, src3, shuf); + __lsx_vst(tmp0, dst_rgb, 0); + __lsx_vst(tmp1, dst_rgb, 12); + __lsx_vst(tmp2, dst_rgb, 24); + dst_rgb += 36; + __lsx_vst(tmp3, dst_rgb, 0); +} + +void ARGBToRGB565Row_LSX(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + int len = width / 8; + __m128i zero = __lsx_vldi(0); + __m128i src0, src1, tmp0, tmp1, dst0; + __m128i shift = {0x0300030003000300, 0x0300030003000300}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmp0 = __lsx_vsrli_b(tmp0, 3); + tmp1 = __lsx_vpackev_b(zero, tmp1); + tmp1 = __lsx_vsrli_h(tmp1, 2); + tmp0 = __lsx_vsll_b(tmp0, shift); + tmp1 = __lsx_vslli_h(tmp1, 5); + dst0 = __lsx_vor_v(tmp0, tmp1); + __lsx_vst(dst0, dst_rgb, 0); + dst_rgb += 16; + src_argb += 32; + } +} + +void ARGBToARGB1555Row_LSX(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + int len = width / 8; + __m128i zero = __lsx_vldi(0); + __m128i src0, src1, tmp0, tmp1, tmp2, tmp3, dst0; + __m128i shift1 = {0x0703070307030703, 0x0703070307030703}; + __m128i shift2 = {0x0200020002000200, 0x0200020002000200}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, 
src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmp0 = __lsx_vsrli_b(tmp0, 3); + tmp1 = __lsx_vsrl_b(tmp1, shift1); + tmp0 = __lsx_vsll_b(tmp0, shift2); + tmp2 = __lsx_vpackev_b(zero, tmp1); + tmp3 = __lsx_vpackod_b(zero, tmp1); + tmp2 = __lsx_vslli_h(tmp2, 5); + tmp3 = __lsx_vslli_h(tmp3, 15); + dst0 = __lsx_vor_v(tmp0, tmp2); + dst0 = __lsx_vor_v(dst0, tmp3); + __lsx_vst(dst0, dst_rgb, 0); + dst_rgb += 16; + src_argb += 32; + } +} + +void ARGBToARGB4444Row_LSX(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + tmp1 = __lsx_vandi_b(tmp1, 0xF0); + tmp0 = __lsx_vsrli_b(tmp0, 4); + dst0 = __lsx_vor_v(tmp1, tmp0); + __lsx_vst(dst0, dst_rgb, 0); + dst_rgb += 16; + src_argb += 32; + } +} + +void ARGBToUV444Row_LSX(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int32_t width) { + int x; + int len = width / 16; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, reg2, reg3, dst0, dst1; + __m128i const_112 = __lsx_vldi(112); + __m128i const_74 = __lsx_vldi(74); + __m128i const_38 = __lsx_vldi(38); + __m128i const_94 = __lsx_vldi(94); + __m128i const_18 = __lsx_vldi(18); + __m128i const_0x8080 = {0x8080808080808080, 0x8080808080808080}; + for (x = 0; x < len; x++) { + DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, + src0, src1, src2, src3); + tmp0 = __lsx_vpickev_h(src1, src0); + tmp1 = __lsx_vpickod_h(src1, src0); + tmp2 = __lsx_vpickev_h(src3, src2); + tmp3 = __lsx_vpickod_h(src3, src2); + reg0 = __lsx_vmaddwev_h_bu(const_0x8080, tmp0, const_112); + reg1 = __lsx_vmaddwev_h_bu(const_0x8080, tmp2, const_112); + reg2 = __lsx_vmulwod_h_bu(tmp0, const_74); + reg3 = 
__lsx_vmulwod_h_bu(tmp2, const_74); + reg2 = __lsx_vmaddwev_h_bu(reg2, tmp1, const_38); + reg3 = __lsx_vmaddwev_h_bu(reg3, tmp3, const_38); + reg0 = __lsx_vsub_h(reg0, reg2); + reg1 = __lsx_vsub_h(reg1, reg3); + reg0 = __lsx_vsrai_h(reg0, 8); + reg1 = __lsx_vsrai_h(reg1, 8); + dst0 = __lsx_vpickev_b(reg1, reg0); + + reg0 = __lsx_vmaddwev_h_bu(const_0x8080, tmp1, const_112); + reg1 = __lsx_vmaddwev_h_bu(const_0x8080, tmp3, const_112); + reg2 = __lsx_vmulwev_h_bu(tmp0, const_18); + reg3 = __lsx_vmulwev_h_bu(tmp2, const_18); + reg2 = __lsx_vmaddwod_h_bu(reg2, tmp0, const_94); + reg3 = __lsx_vmaddwod_h_bu(reg3, tmp2, const_94); + reg0 = __lsx_vsub_h(reg0, reg2); + reg1 = __lsx_vsub_h(reg1, reg3); + reg0 = __lsx_vsrai_h(reg0, 8); + reg1 = __lsx_vsrai_h(reg1, 8); + dst1 = __lsx_vpickev_b(reg1, reg0); + + __lsx_vst(dst0, dst_u, 0); + __lsx_vst(dst1, dst_v, 0); + dst_u += 16; + dst_v += 16; + src_argb += 64; + } +} + +void ARGBMultiplyRow_LSX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 4; + __m128i zero = __lsx_vldi(0); + __m128i src0, src1, dst0, dst1; + __m128i tmp0, tmp1, tmp2, tmp3; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1); + tmp0 = __lsx_vilvl_b(src0, src0); + tmp1 = __lsx_vilvh_b(src0, src0); + tmp2 = __lsx_vilvl_b(zero, src1); + tmp3 = __lsx_vilvh_b(zero, src1); + dst0 = __lsx_vmuh_hu(tmp0, tmp2); + dst1 = __lsx_vmuh_hu(tmp1, tmp3); + dst0 = __lsx_vpickev_b(dst1, dst0); + __lsx_vst(dst0, dst_argb, 0); + src_argb0 += 16; + src_argb1 += 16; + dst_argb += 16; + } +} + +void ARGBAddRow_LSX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 4; + __m128i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1); + dst0 = __lsx_vsadd_bu(src0, src1); + __lsx_vst(dst0, dst_argb, 0); + src_argb0 += 16; + src_argb1 += 16; + 
dst_argb += 16; + } +} + +void ARGBSubtractRow_LSX(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 4; + __m128i src0, src1, dst0; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb0, 0, src_argb1, 0, src0, src1); + dst0 = __lsx_vssub_bu(src0, src1); + __lsx_vst(dst0, dst_argb, 0); + src_argb0 += 16; + src_argb1 += 16; + dst_argb += 16; + } +} + +void ARGBAttenuateRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i b, g, r, a, dst0, dst1; + __m128i control = {0x0005000100040000, 0x0007000300060002}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + b = __lsx_vpackev_b(tmp0, tmp0); + r = __lsx_vpackod_b(tmp0, tmp0); + g = __lsx_vpackev_b(tmp1, tmp1); + a = __lsx_vpackod_b(tmp1, tmp1); + reg0 = __lsx_vmulwev_w_hu(b, a); + reg1 = __lsx_vmulwod_w_hu(b, a); + reg2 = __lsx_vmulwev_w_hu(r, a); + reg3 = __lsx_vmulwod_w_hu(r, a); + reg4 = __lsx_vmulwev_w_hu(g, a); + reg5 = __lsx_vmulwod_w_hu(g, a); + reg0 = __lsx_vssrani_h_w(reg1, reg0, 24); + reg2 = __lsx_vssrani_h_w(reg3, reg2, 24); + reg4 = __lsx_vssrani_h_w(reg5, reg4, 24); + reg0 = __lsx_vshuf_h(control, reg0, reg0); + reg2 = __lsx_vshuf_h(control, reg2, reg2); + reg4 = __lsx_vshuf_h(control, reg4, reg4); + tmp0 = __lsx_vpackev_b(reg4, reg0); + tmp1 = __lsx_vpackev_b(a, reg2); + dst0 = __lsx_vilvl_h(tmp1, tmp0); + dst1 = __lsx_vilvh_h(tmp1, tmp0); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + dst_argb += 32; + src_argb += 32; + } +} + +void ARGBToRGB565DitherRow_LSX(const uint8_t* src_argb, + uint8_t* dst_rgb, + uint32_t dither4, + int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1, dst0; + __m128i b, g, r; + __m128i zero = 
__lsx_vldi(0); + __m128i vec_dither = __lsx_vldrepl_w(&dither4, 0); + + vec_dither = __lsx_vilvl_b(zero, vec_dither); + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + b = __lsx_vpackev_b(zero, tmp0); + r = __lsx_vpackod_b(zero, tmp0); + g = __lsx_vpackev_b(zero, tmp1); + b = __lsx_vadd_h(b, vec_dither); + g = __lsx_vadd_h(g, vec_dither); + r = __lsx_vadd_h(r, vec_dither); + DUP2_ARG1(__lsx_vclip255_h, b, g, b, g); + r = __lsx_vclip255_h(r); + b = __lsx_vsrai_h(b, 3); + g = __lsx_vsrai_h(g, 2); + r = __lsx_vsrai_h(r, 3); + g = __lsx_vslli_h(g, 5); + r = __lsx_vslli_h(r, 11); + dst0 = __lsx_vor_v(b, g); + dst0 = __lsx_vor_v(dst0, r); + __lsx_vst(dst0, dst_rgb, 0); + src_argb += 32; + dst_rgb += 16; } +} -#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \ - { \ - __m128i _tmp0, _tmp1, _tmp2, _tmp3; \ - __m128i _reg0, _reg1; \ - _tmp0 = __lsx_vaddwev_h_bu(_tmpb, _nexb); \ - _tmp1 = __lsx_vaddwod_h_bu(_tmpb, _nexb); \ - _tmp2 = __lsx_vaddwev_h_bu(_tmpg, _nexg); \ - _tmp3 = __lsx_vaddwod_h_bu(_tmpg, _nexg); \ - _reg0 = __lsx_vaddwev_h_bu(_tmpr, _nexr); \ - _reg1 = __lsx_vaddwod_h_bu(_tmpr, _nexr); \ - _tmpb = __lsx_vavgr_hu(_tmp0, _tmp1); \ - _tmpg = __lsx_vavgr_hu(_tmp2, _tmp3); \ - _tmpr = __lsx_vavgr_hu(_reg0, _reg1); \ - _reg0 = __lsx_vmadd_h(const_8080, const_112, _tmpb); \ - _reg1 = __lsx_vmadd_h(const_8080, const_112, _tmpr); \ - _reg0 = __lsx_vmsub_h(_reg0, const_74, _tmpg); \ - _reg1 = __lsx_vmsub_h(_reg1, const_94, _tmpg); \ - _reg0 = __lsx_vmsub_h(_reg0, const_38, _tmpr); \ - _reg1 = __lsx_vmsub_h(_reg1, const_18, _tmpb); \ - _dst0 = __lsx_vpickod_b(_reg1, _reg0); \ +void ARGBShuffleRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + int x; + int len = width / 8; + __m128i src0, src1, dst0, dst1; + __m128i shuf = {0x0404040400000000, 0x0C0C0C0C08080808}; + __m128i temp = 
__lsx_vldrepl_w(shuffler, 0); + + shuf = __lsx_vadd_b(shuf, temp); + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + dst0 = __lsx_vshuf_b(src0, src0, shuf); + dst1 = __lsx_vshuf_b(src1, src1, shuf); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBShadeRow_LSX(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + int x; + int len = width / 4; + __m128i src0, dst0, tmp0, tmp1; + __m128i vec_value = __lsx_vreplgr2vr_w(value); + + vec_value = __lsx_vilvl_b(vec_value, vec_value); + for (x = 0; x < len; x++) { + src0 = __lsx_vld(src_argb, 0); + tmp0 = __lsx_vilvl_b(src0, src0); + tmp1 = __lsx_vilvh_b(src0, src0); + tmp0 = __lsx_vmuh_hu(tmp0, vec_value); + tmp1 = __lsx_vmuh_hu(tmp1, vec_value); + dst0 = __lsx_vpickod_b(tmp1, tmp0); + __lsx_vst(dst0, dst_argb, 0); + src_argb += 16; + dst_argb += 16; + } +} + +void ARGBGrayRow_LSX(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1; + __m128i reg0, reg1, reg2, dst0, dst1; + __m128i const_128 = __lsx_vldi(0x480); + __m128i const_150 = __lsx_vldi(0x96); + __m128i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + reg0 = __lsx_vdp2_h_bu(tmp0, const_br); + reg1 = __lsx_vmaddwev_h_bu(const_128, tmp1, const_150); + reg2 = __lsx_vadd_h(reg0, reg1); + tmp0 = __lsx_vpackod_b(reg2, reg2); + tmp1 = __lsx_vpackod_b(tmp1, reg2); + dst0 = __lsx_vilvl_h(tmp1, tmp0); + dst1 = __lsx_vilvh_h(tmp1, tmp0); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBSepiaRow_LSX(uint8_t* dst_argb, int width) { + int x; + int len = width / 8; + __m128i src0, src1, tmp0, tmp1; + __m128i reg0, reg1, spb, 
spg, spr; + __m128i dst0, dst1; + __m128i spb_g = __lsx_vldi(68); + __m128i spg_g = __lsx_vldi(88); + __m128i spr_g = __lsx_vldi(98); + __m128i spb_br = {0x2311231123112311, 0x2311231123112311}; + __m128i spg_br = {0x2D162D162D162D16, 0x2D162D162D162D16}; + __m128i spr_br = {0x3218321832183218, 0x3218321832183218}; + __m128i shuff = {0x1706150413021100, 0x1F0E1D0C1B0A1908}; + + for (x = 0; x < len; x++) { + DUP2_ARG2(__lsx_vld, dst_argb, 0, dst_argb, 16, src0, src1); + tmp0 = __lsx_vpickev_b(src1, src0); + tmp1 = __lsx_vpickod_b(src1, src0); + DUP2_ARG2(__lsx_vdp2_h_bu, tmp0, spb_br, tmp0, spg_br, spb, spg); + spr = __lsx_vdp2_h_bu(tmp0, spr_br); + spb = __lsx_vmaddwev_h_bu(spb, tmp1, spb_g); + spg = __lsx_vmaddwev_h_bu(spg, tmp1, spg_g); + spr = __lsx_vmaddwev_h_bu(spr, tmp1, spr_g); + spb = __lsx_vsrli_h(spb, 7); + spg = __lsx_vsrli_h(spg, 7); + spr = __lsx_vsrli_h(spr, 7); + spg = __lsx_vsat_hu(spg, 7); + spr = __lsx_vsat_hu(spr, 7); + reg0 = __lsx_vpackev_b(spg, spb); + reg1 = __lsx_vshuf_b(tmp1, spr, shuff); + dst0 = __lsx_vilvl_h(reg1, reg0); + dst1 = __lsx_vilvh_h(reg1, reg0); + __lsx_vst(dst0, dst_argb, 0); + __lsx_vst(dst1, dst_argb, 16); + dst_argb += 32; } +} void ARGB4444ToARGBRow_LSX(const uint8_t* src_argb4444, uint8_t* dst_argb, @@ -561,39 +1687,6 @@ void RGB565ToUVRow_LSX(const uint8_t* src_rgb565, } } -void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { - int x; - int len = width / 16; - __m128i src0, src1, src2; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i reg0, reg1, dst0; - __m128i const_129 = __lsx_vldi(129); - __m128i const_br = {0x4219421942194219, 0x4219421942194219}; - __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; - __m128i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C}; - __m128i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604}; - __m128i shuff2 = {0x000A000700040001, 0x001600130010000D}; - __m128i shuff3 = {0x0002001F001C0019, 0x000E000B00080005}; - - for (x = 0; x < len; x++) { - src0 = 
__lsx_vld(src_rgb24, 0); - src1 = __lsx_vld(src_rgb24, 16); - src2 = __lsx_vld(src_rgb24, 32); - tmp0 = __lsx_vshuf_b(src1, src0, shuff0); - tmp1 = __lsx_vshuf_b(src1, src2, shuff1); - tmp2 = __lsx_vshuf_b(src1, src0, shuff2); - tmp3 = __lsx_vshuf_b(src1, src2, shuff3); - reg0 = __lsx_vmaddwev_h_bu(const_1080, tmp2, const_129); - reg1 = __lsx_vmaddwev_h_bu(const_1080, tmp3, const_129); - reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp1); - dst0 = __lsx_vpickod_b(reg1, reg0); - __lsx_vst(dst0, dst_y, 0); - dst_y += 16; - src_rgb24 += 48; - } -} - void RGB24ToUVRow_LSX(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, @@ -647,39 +1740,6 @@ void RGB24ToUVRow_LSX(const uint8_t* src_rgb24, } } -void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width) { - int x; - int len = width / 16; - __m128i src0, src1, src2; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i reg0, reg1, dst0; - __m128i const_129 = __lsx_vldi(129); - __m128i const_br = {0x1942194219421942, 0x1942194219421942}; - __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; - __m128i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C}; - __m128i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604}; - __m128i shuff2 = {0x000A000700040001, 0x001600130010000D}; - __m128i shuff3 = {0x0002001F001C0019, 0x000E000B00080005}; - - for (x = 0; x < len; x++) { - src0 = __lsx_vld(src_raw, 0); - src1 = __lsx_vld(src_raw, 16); - src2 = __lsx_vld(src_raw, 32); - tmp0 = __lsx_vshuf_b(src1, src0, shuff0); - tmp1 = __lsx_vshuf_b(src1, src2, shuff1); - tmp2 = __lsx_vshuf_b(src1, src0, shuff2); - tmp3 = __lsx_vshuf_b(src1, src2, shuff3); - reg0 = __lsx_vmaddwev_h_bu(const_1080, tmp2, const_129); - reg1 = __lsx_vmaddwev_h_bu(const_1080, tmp3, const_129); - reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp1); - dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8); - __lsx_vst(dst0, dst_y, 0); - dst_y += 16; - src_raw += 48; - 
} -} - void RAWToUVRow_LSX(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, @@ -914,62 +1974,6 @@ void SobelXYRow_LSX(const uint8_t* src_sobelx, } } -void ARGBToYJRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width) { - int x; - int len = width / 16; - __m128i src0, src1, src2, src3, dst0; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i reg0, reg1; - __m128i const_128 = __lsx_vldi(0x480); - __m128i const_150 = __lsx_vldi(0x96); - __m128i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48, - src0, src1, src2, src3); - tmp0 = __lsx_vpickev_b(src1, src0); - tmp1 = __lsx_vpickod_b(src1, src0); - tmp2 = __lsx_vpickev_b(src3, src2); - tmp3 = __lsx_vpickod_b(src3, src2); - reg0 = __lsx_vmaddwev_h_bu(const_128, tmp1, const_150); - reg1 = __lsx_vmaddwev_h_bu(const_128, tmp3, const_150); - reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp2); - dst0 = __lsx_vpickod_b(reg1, reg0); - __lsx_vst(dst0, dst_y, 0); - dst_y += 16; - src_argb += 64; - } -} - -void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - int x; - int len = width / 16; - __m128i src0, src1, src2, src3, dst0; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i reg0, reg1; - __m128i const_129 = __lsx_vldi(0x81); - __m128i const_br = {0x1942194219421942, 0x1942194219421942}; - __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_bgra, 0, src_bgra, 16, src_bgra, 32, src_bgra, 48, - src0, src1, src2, src3); - tmp0 = __lsx_vpickod_b(src1, src0); - tmp1 = __lsx_vpickev_b(src1, src0); - tmp2 = __lsx_vpickod_b(src3, src2); - tmp3 = __lsx_vpickev_b(src3, src2); - reg0 = __lsx_vmaddwod_h_bu(const_1080, tmp1, const_129); - reg1 = __lsx_vmaddwod_h_bu(const_1080, tmp3, const_129); - reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lsx_vdp2add_h_bu(reg1, const_br, 
tmp2); - dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8); - __lsx_vst(dst0, dst_y, 0); - dst_y += 16; - src_bgra += 64; - } -} - void BGRAToUVRow_LSX(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, @@ -1018,34 +2022,6 @@ void BGRAToUVRow_LSX(const uint8_t* src_bgra, } } -void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - int x; - int len = width / 16; - __m128i src0, src1, src2, src3, dst0; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i reg0, reg1; - __m128i const_129 = __lsx_vldi(0x81); - __m128i const_br = {0x1942194219421942, 0x1942194219421942}; - __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_abgr, 0, src_abgr, 16, src_abgr, 32, src_abgr, 48, - src0, src1, src2, src3); - tmp0 = __lsx_vpickev_b(src1, src0); - tmp1 = __lsx_vpickod_b(src1, src0); - tmp2 = __lsx_vpickev_b(src3, src2); - tmp3 = __lsx_vpickod_b(src3, src2); - reg0 = __lsx_vmaddwev_h_bu(const_1080, tmp1, const_129); - reg1 = __lsx_vmaddwev_h_bu(const_1080, tmp3, const_129); - reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp2); - dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8); - __lsx_vst(dst0, dst_y, 0); - dst_y += 16; - src_abgr += 64; - } -} - void ABGRToUVRow_LSX(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_u, @@ -1094,34 +2070,6 @@ void ABGRToUVRow_LSX(const uint8_t* src_abgr, } } -void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - int x; - int len = width / 16; - __m128i src0, src1, src2, src3, dst0; - __m128i tmp0, tmp1, tmp2, tmp3; - __m128i reg0, reg1; - __m128i const_129 = __lsx_vldi(0x81); - __m128i const_br = {0x4219421942194219, 0x4219421942194219}; - __m128i const_1080 = {0x1080108010801080, 0x1080108010801080}; - - for (x = 0; x < len; x++) { - DUP4_ARG2(__lsx_vld, src_rgba, 0, src_rgba, 16, src_rgba, 32, src_rgba, 48, - src0, src1, src2, src3); - tmp0 = __lsx_vpickod_b(src1, src0); - tmp1 = 
__lsx_vpickev_b(src1, src0); - tmp2 = __lsx_vpickod_b(src3, src2); - tmp3 = __lsx_vpickev_b(src3, src2); - reg0 = __lsx_vmaddwod_h_bu(const_1080, tmp1, const_129); - reg1 = __lsx_vmaddwod_h_bu(const_1080, tmp3, const_129); - reg0 = __lsx_vdp2add_h_bu(reg0, const_br, tmp0); - reg1 = __lsx_vdp2add_h_bu(reg1, const_br, tmp2); - dst0 = __lsx_vsrlni_b_h(reg1, reg0, 8); - __lsx_vst(dst0, dst_y, 0); - dst_y += 16; - src_rgba += 64; - } -} - void RGBAToUVRow_LSX(const uint8_t* src_rgba, int src_stride_rgba, uint8_t* dst_u, @@ -1821,6 +2769,216 @@ void HalfFloatRow_LSX(const uint16_t* src, } } +struct RgbConstants { + uint8_t kRGBToY[4]; + uint16_t kAddY; + uint16_t pad; +}; + +// RGB to JPeg coefficients +// B * 0.1140 coefficient = 29 +// G * 0.5870 coefficient = 150 +// R * 0.2990 coefficient = 77 +// Add 0.5 = 0x80 +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, + 128, + 0}; + +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; + +// RGB to BT.601 coefficients +// B * 0.1016 coefficient = 25 +// G * 0.5078 coefficient = 129 +// R * 0.2578 coefficient = 66 +// Add 16.5 = 0x1080 + +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + 0x1080, + 0}; + +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, + 0x1080, + 0}; + +// ARGB expects first 3 values to contain RGB and 4th value is ignored. 
+static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + asm volatile( + "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants + "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants + "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants + "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants + "1: \n\t" + "vld $vr4, %0, 0 \n\t" + "vld $vr5, %0, 16 \n\t" + "vld $vr6, %0, 32 \n\t" + "vld $vr7, %0, 48 \n\t" // load 16 pixels of + // ARGB + "vor.v $vr12, $vr3, $vr3 \n\t" + "vor.v $vr13, $vr3, $vr3 \n\t" + "addi.d %2, %2, -16 \n\t" // 16 processed per + // loop. + "vpickev.b $vr8, $vr5, $vr4 \n\t" // BR + "vpickev.b $vr10, $vr7, $vr6 \n\t" + "vpickod.b $vr9, $vr5, $vr4 \n\t" // GA + "vpickod.b $vr11, $vr7, $vr6 \n\t" + "vmaddwev.h.bu $vr12, $vr8, $vr0 \n\t" // B + "vmaddwev.h.bu $vr13, $vr10, $vr0 \n\t" + "vmaddwev.h.bu $vr12, $vr9, $vr1 \n\t" // G + "vmaddwev.h.bu $vr13, $vr11, $vr1 \n\t" + "vmaddwod.h.bu $vr12, $vr8, $vr2 \n\t" // R + "vmaddwod.h.bu $vr13, $vr10, $vr2 \n\t" + "addi.d %0, %0, 64 \n\t" + "vpickod.b $vr10, $vr13, $vr12 \n\t" + "vst $vr10, %1, 0 \n\t" + "addi.d %1, %1, 16 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_argb), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants) + : "memory"); +} + +void ARGBToYRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_LSX(src_argb, dst_y, width, &kRgb24I601Constants); +} + +void ARGBToYJRow_LSX(const uint8_t* src_argb, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_LSX(src_argb, dst_yj, width, &kRgb24JPEGConstants); +} + +void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_LSX(src_abgr, dst_y, width, &kRawI601Constants); +} + +void ABGRToYJRow_LSX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_LSX(src_abgr, dst_yj, width, &kRawJPEGConstants); +} + +// RGBA expects first value to be A and ignored, then 3 values to contain RGB. 
+// Same code as ARGB, except the LD4 +static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + asm volatile( + "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants + "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants + "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants + "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants + "1: \n\t" + "vld $vr4, %0, 0 \n\t" + "vld $vr5, %0, 16 \n\t" + "vld $vr6, %0, 32 \n\t" + "vld $vr7, %0, 48 \n\t" // load 16 pixels of + // RGBA + "vor.v $vr12, $vr3, $vr3 \n\t" + "vor.v $vr13, $vr3, $vr3 \n\t" + "addi.d %2, %2, -16 \n\t" // 16 processed per + // loop. + "vpickev.b $vr8, $vr5, $vr4 \n\t" // AG + "vpickev.b $vr10, $vr7, $vr6 \n\t" + "vpickod.b $vr9, $vr5, $vr4 \n\t" // BR + "vpickod.b $vr11, $vr7, $vr6 \n\t" + "vmaddwev.h.bu $vr12, $vr9, $vr0 \n\t" // B + "vmaddwev.h.bu $vr13, $vr11, $vr0 \n\t" + "vmaddwod.h.bu $vr12, $vr8, $vr1 \n\t" // G + "vmaddwod.h.bu $vr13, $vr10, $vr1 \n\t" + "vmaddwod.h.bu $vr12, $vr9, $vr2 \n\t" // R + "vmaddwod.h.bu $vr13, $vr11, $vr2 \n\t" + "addi.d %0, %0, 64 \n\t" + "vpickod.b $vr10, $vr13, $vr12 \n\t" + "vst $vr10, %1, 0 \n\t" + "addi.d %1, %1, 16 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_rgba), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants) + : "memory"); +} + +void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_LSX(src_rgba, dst_y, width, &kRgb24I601Constants); +} + +void RGBAToYJRow_LSX(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { + RGBAToYMatrixRow_LSX(src_rgba, dst_yj, width, &kRgb24JPEGConstants); +} + +void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_LSX(src_bgra, dst_y, width, &kRawI601Constants); +} + +static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + int8_t shuff[64] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, + 20, 
21, 23, 24, 26, 27, 29, 30, 0, 1, 3, 4, 6, + 7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10, + 0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0, + 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; + asm volatile( + "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants + "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants + "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants + "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants + "vld $vr4, %4, 0 \n\t" // load shuff + "vld $vr5, %4, 16 \n\t" + "vld $vr6, %4, 32 \n\t" + "vld $vr7, %4, 48 \n\t" + "1: \n\t" + "vld $vr8, %0, 0 \n\t" + "vld $vr9, %0, 16 \n\t" + "vld $vr10, %0, 32 \n\t" // load 16 pixels of + // RGB + "vor.v $vr12, $vr3, $vr3 \n\t" + "vor.v $vr13, $vr3, $vr3 \n\t" + "addi.d %2, %2, -16 \n\t" // 16 processed per + // loop. + "vshuf.b $vr14, $vr9, $vr8, $vr4 \n\t" + "vshuf.b $vr15, $vr9, $vr10, $vr5 \n\t" + "vshuf.b $vr16, $vr9, $vr8, $vr6 \n\t" + "vshuf.b $vr17, $vr9, $vr10, $vr7 \n\t" + "vmaddwev.h.bu $vr12, $vr16, $vr1 \n\t" // G + "vmaddwev.h.bu $vr13, $vr17, $vr1 \n\t" + "vmaddwev.h.bu $vr12, $vr14, $vr0 \n\t" // B + "vmaddwev.h.bu $vr13, $vr15, $vr0 \n\t" + "vmaddwod.h.bu $vr12, $vr14, $vr2 \n\t" // R + "vmaddwod.h.bu $vr13, $vr15, $vr2 \n\t" + "addi.d %0, %0, 48 \n\t" + "vpickod.b $vr10, $vr13, $vr12 \n\t" + "vst $vr10, %1, 0 \n\t" + "addi.d %1, %1, 16 \n\t" + "bnez %2, 1b \n\t" + : "+&r"(src_rgba), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(rgbconstants), // %3 + "r"(shuff) // %4 + : "memory"); +} + +void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LSX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} + +void RAWToYJRow_LSX(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_LSX(src_raw, dst_yj, width, &kRawJPEGConstants); +} + +void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_LSX(src_rgb24, dst_y, width, &kRgb24I601Constants); +} + +void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width) 
{ + RGBToYMatrixRow_LSX(src_raw, dst_y, width, &kRawI601Constants); +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/files/source/row_mmi.cc b/files/source/row_mmi.cc deleted file mode 100644 index 362fd1cf..00000000 --- a/files/source/row_mmi.cc +++ /dev/null @@ -1,7842 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#include "libyuv/row.h" - -#include // For memcpy and memset. - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for Mips MMI. -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -// clang-format off - -void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, - uint8_t* dst_argb, - int width) { - uint64_t src0, src1, dest; - const uint64_t mask = 0xff000000ULL; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" - - "or %[src0], %[src0], %[mask] \n\t" - "or %[src1], %[src1], %[mask] \n\t" - "punpcklwd %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t" - - "or %[src0], %[src0], %[mask] \n\t" - "or %[src1], %[src1], %[mask] \n\t" - "punpcklwd %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 
0x0c \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_rgb24), [dst_ptr] "r"(dst_argb), [width] "r"(width), - [mask] "f"(mask) - : "memory"); -} - -void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - uint64_t src0, src1, dest; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0xff000000ULL; - const uint64_t mask2 = 0xc6; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" - - "or %[src0], %[src0], %[mask1] \n\t" - "punpcklbh %[src0], %[src0], %[mask0] \n\t" - "pshufh %[src0], %[src0], %[mask2] \n\t" - "or %[src1], %[src1], %[mask1] \n\t" - "punpcklbh %[src1], %[src1], %[mask0] \n\t" - "pshufh %[src1], %[src1], %[mask2] \n\t" - "packushb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t" - - "or %[src0], %[src0], %[mask1] \n\t" - "punpcklbh %[src0], %[src0], %[mask0] \n\t" - "pshufh %[src0], %[src0], %[mask2] \n\t" - "or %[src1], %[src1], %[mask1] \n\t" - "punpcklbh %[src1], %[src1], %[mask0] \n\t" - "pshufh %[src1], %[src1], %[mask2] \n\t" - "packushb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_raw), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), - [mask1] "f"(mask1), [mask2] 
"f"(mask2), [width] "r"(width) - : "memory"); -} - -void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - uint64_t src0, src1; - uint64_t ftmp[4]; - uint64_t mask0 = 0xc6; - uint64_t mask1 = 0x6c; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_raw]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_raw]) \n\t" - "gslwrc1 %[src1], 0x08(%[src_raw]) \n\t" - "gslwlc1 %[src1], 0x0b(%[src_raw]) \n\t" - - "punpcklbh %[ftmp0], %[src0], %[zero] \n\t" - "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t" - "punpckhbh %[ftmp1], %[src0], %[zero] \n\t" - "punpcklbh %[src1], %[src1], %[zero] \n\t" - "pextrh %[ftmp2], %[ftmp0], %[three] \n\t" - "pextrh %[ftmp3], %[ftmp1], %[one] \n\t" - "pinsrh_3 %[ftmp0], %[ftmp0], %[ftmp3] \n\t" - "pextrh %[ftmp3], %[ftmp1], %[two] \n\t" - "pinsrh_1 %[ftmp1], %[ftmp1], %[ftmp2] \n\t" - "pshufh %[src1], %[src1], %[mask1] \n\t" - "pextrh %[ftmp2], %[src1], %[zero] \n\t" - "pinsrh_2 %[ftmp1], %[ftmp1], %[ftmp2] \n\t" - "pinsrh_0 %[src1], %[src1], %[ftmp3] \n\t" - "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" - "packushb %[src1], %[src1], %[zero] \n\t" - - "gssdrc1 %[ftmp0], 0x00(%[dst_rgb24]) \n\t" - "gssdlc1 %[ftmp0], 0x07(%[dst_rgb24]) \n\t" - "gsswrc1 %[src1], 0x08(%[dst_rgb24]) \n\t" - "gsswlc1 %[src1], 0x0b(%[dst_rgb24]) \n\t" - - "daddiu %[src_raw], %[src_raw], 0x0c \n\t" - "daddiu %[dst_rgb24], %[dst_rgb24], 0x0c \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]), - [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]) - : [src_raw] "r"(src_raw), [dst_rgb24] "r"(dst_rgb24), [width] "r"(width), - [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00), - [one] "f"(0x01), [two] "f"(0x02), [three] "f"(0x03) - : "memory"); -} - -void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565, - uint8_t* dst_argb, - int width) { - uint64_t ftmp[5]; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - 
uint64_t c2 = 0x0007000700070007; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psrlh %[r], %[src1], %[three] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[two] \n\t" - "psrlh %[src1], %[g], %[four] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "packushb %[b], %[b], %[r] \n\t" - "packushb %[g], %[g], %[c1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" - "punpckhhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" - "daddiu %[src_rgb565], %[src_rgb565], 0x08 \n\t" - "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), - [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]) - : [src_rgb565] "r"(src_rgb565), [dst_argb] "r"(dst_argb), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), - [eight] "f"(0x08), [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), - [four] "f"(0x04) - : "memory"); -} - -void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555, - uint8_t* dst_argb, - int width) { - uint64_t ftmp[6]; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0003000300030003; - uint64_t c3 = 0x007c007c007c007c; - uint64_t c4 = 0x0001000100010001; - 
__asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "and %[r], %[src1], %[c3] \n\t" - "psrlh %[r], %[r], %[two] \n\t" - "psrlh %[a], %[src1], %[seven] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[three] \n\t" - "psrlh %[src1], %[g], %[two] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "xor %[a], %[a], %[c1] \n\t" - "paddb %[a], %[a], %[c4] \n\t" - "packushb %[b], %[b], %[r] \n\t" - "packushb %[g], %[g], %[a] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" - "punpckhhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" - "daddiu %[src_argb1555], %[src_argb1555], 0x08 \n\t" - "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), - [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5]) - : [src_argb1555] "r"(src_argb1555), [dst_argb] "r"(dst_argb), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), - [c3] "f"(c3), [c4] "f"(c4), [eight] "f"(0x08), [five] "f"(0x05), - [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07) - : "memory"); -} - -void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444, - uint8_t* dst_argb, - int width) { - uint64_t ftmp[6]; - uint64_t c0 = 
0x000f000f000f000f; - uint64_t c1 = 0x00ff00ff00ff00ff; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g], %[src0], %[four] \n\t" - "and %[r], %[src1], %[c0] \n\t" - "psrlh %[a], %[src1], %[four] \n\t" - "psllh %[src0], %[b], %[four] \n\t" - "or %[b], %[src0], %[b] \n\t" - "psllh %[src0], %[g], %[four] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psllh %[src0], %[r], %[four] \n\t" - "or %[r], %[src0], %[r] \n\t" - "psllh %[src0], %[a], %[four] \n\t" - "or %[a], %[src0], %[a] \n\t" - "packushb %[b], %[b], %[r] \n\t" - "packushb %[g], %[g], %[a] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" - "punpckhhw %[r], %[src0], %[src1] \n\t" - "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" - "daddiu %[src_argb4444], %[src_argb4444], 0x08 \n\t" - "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), - [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5]) - : [src_argb4444] "r"(src_argb4444), [dst_argb] "r"(dst_argb), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [eight] "f"(0x08), - [four] "f"(0x04) - : "memory"); -} - -void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - uint64_t src; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsswlc1 %[src], 0x03(%[dst_ptr]) \n\t" - "gsswrc1 %[src], 0x00(%[dst_ptr]) \n\t" - - "gslwlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x04(%[src_ptr]) \n\t" - "gsswlc1 %[src], 0x06(%[dst_ptr]) \n\t" - "gsswrc1 
%[src], 0x03(%[dst_ptr]) \n\t" - - "gslwlc1 %[src], 0x0b(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x08(%[src_ptr]) \n\t" - "gsswlc1 %[src], 0x09(%[dst_ptr]) \n\t" - "gsswrc1 %[src], 0x06(%[dst_ptr]) \n\t" - - "gslwlc1 %[src], 0x0f(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x0c(%[src_ptr]) \n\t" - "gsswlc1 %[src], 0x0c(%[dst_ptr]) \n\t" - "gsswrc1 %[src], 0x09(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_rgb), [width] "r"(width) - : "memory"); -} - -void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - uint64_t src0, src1; - uint64_t ftmp[3]; - uint64_t mask0 = 0xc6; - uint64_t mask1 = 0x18; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[ftmp0], %[src0], %[zero] \n\t" - "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t" - "punpckhbh %[ftmp1], %[src0], %[zero] \n\t" - "punpcklbh %[ftmp2], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - - "pextrh %[src0], %[ftmp1], %[two] \n\t" - "pinsrh_3 %[ftmp0], %[ftmp0], %[src0] \n\t" - "pshufh %[ftmp1], %[ftmp1], %[one] \n\t" - - "pextrh %[src0], %[ftmp2], %[two] \n\t" - "pinsrh_2 %[ftmp1], %[ftmp1], %[src0] \n\t" - "pextrh %[src0], %[ftmp2], %[one] \n\t" - "pinsrh_3 %[ftmp1], %[ftmp1], %[src0] \n\t" - "pextrh %[src0], %[ftmp2], %[zero] \n\t" - "pshufh %[src1], %[src1], %[mask1] \n\t" - "pinsrh_0 %[src1], %[src1], %[src0] \n\t" - "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" - "packushb %[src1], %[src1], %[zero] \n\t" - - "gssdrc1 %[ftmp0], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[ftmp0], 0x07(%[dst_rgb]) \n\t" - "gsswrc1 %[src1], 0x08(%[dst_rgb]) \n\t" - "gsswlc1 %[src1], 0x0b(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], 
%[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x0c \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]), - [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00), - [one] "f"(0x01), [two] "f"(0x02) - : "memory"); -} - -void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { - uint64_t src0, src1; - uint64_t ftmp[3]; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[b], %[src0], %[src1] \n\t" - "punpckhbh %[g], %[src0], %[src1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklbh %[b], %[src0], %[zero] \n\t" - "punpckhbh %[g], %[src0], %[zero] \n\t" - "punpcklbh %[r], %[src1], %[zero] \n\t" - - "psrlh %[b], %[b], %[three] \n\t" - "psrlh %[g], %[g], %[two] \n\t" - "psrlh %[r], %[r], %[three] \n\t" - - "psllh %[g], %[g], %[five] \n\t" - "psllh %[r], %[r], %[eleven] \n\t" - "or %[b], %[b], %[g] \n\t" - "or %[b], %[b], %[r] \n\t" - - "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), - [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [zero] "f"(0x00), [two] "f"(0x02), [three] "f"(0x03), [five] "f"(0x05), - [eleven] "f"(0x0b) - : "memory"); -} - -// dither4 is a row of 4 values from 4x4 dither matrix. -// The 4x4 matrix contains values to increase RGB. 
When converting to -// fewer bits (565) this provides an ordered dither. -// The order in the 4x4 matrix in first byte is upper left. -// The 4 values are passed as an int, then referenced as an array, so -// endian will not affect order of the original matrix. But the dither4 -// will containing the first pixel in the lower byte for little endian -// or the upper byte for big endian. -void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb, - uint8_t* dst_rgb, - const uint32_t dither4, - int width) { - uint64_t src0, src1; - uint64_t ftmp[3]; - uint64_t c0 = 0x00ff00ff00ff00ff; - - __asm__ volatile( - "punpcklbh %[dither], %[dither], %[zero] \n\t" - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[b], %[src0], %[src1] \n\t" - "punpckhbh %[g], %[src0], %[src1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklbh %[b], %[src0], %[zero] \n\t" - "punpckhbh %[g], %[src0], %[zero] \n\t" - "punpcklbh %[r], %[src1], %[zero] \n\t" - - "paddh %[b], %[b], %[dither] \n\t" - "paddh %[g], %[g], %[dither] \n\t" - "paddh %[r], %[r], %[dither] \n\t" - "pcmpgth %[src0], %[b], %[c0] \n\t" - "or %[src0], %[src0], %[b] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "pcmpgth %[src0], %[g], %[c0] \n\t" - "or %[src0], %[src0], %[g] \n\t" - "and %[g], %[src0], %[c0] \n\t" - "pcmpgth %[src0], %[r], %[c0] \n\t" - "or %[src0], %[src0], %[r] \n\t" - "and %[r], %[src0], %[c0] \n\t" - - "psrlh %[b], %[b], %[three] \n\t" - "psrlh %[g], %[g], %[two] \n\t" - "psrlh %[r], %[r], %[three] \n\t" - - "psllh %[g], %[g], %[five] \n\t" - "psllh %[r], %[r], %[eleven] \n\t" - "or %[b], %[b], %[g] \n\t" - "or %[b], %[b], %[r] \n\t" - - "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" - "daddiu 
%[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), - [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [dither] "f"(dither4), [c0] "f"(c0), [zero] "f"(0x00), [two] "f"(0x02), - [three] "f"(0x03), [five] "f"(0x05), [eleven] "f"(0x0b) - : "memory"); -} - -void ARGBToARGB1555Row_MMI(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - uint64_t src0, src1; - uint64_t ftmp[4]; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[b], %[src0], %[src1] \n\t" - "punpckhbh %[g], %[src0], %[src1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklbh %[b], %[src0], %[zero] \n\t" - "punpckhbh %[g], %[src0], %[zero] \n\t" - "punpcklbh %[r], %[src1], %[zero] \n\t" - "punpckhbh %[a], %[src1], %[zero] \n\t" - - "psrlh %[b], %[b], %[three] \n\t" - "psrlh %[g], %[g], %[three] \n\t" - "psrlh %[r], %[r], %[three] \n\t" - "psrlh %[a], %[a], %[seven] \n\t" - - "psllh %[g], %[g], %[five] \n\t" - "psllh %[r], %[r], %[ten] \n\t" - "psllh %[a], %[a], %[fifteen] \n\t" - "or %[b], %[b], %[g] \n\t" - "or %[b], %[b], %[r] \n\t" - "or %[b], %[b], %[a] \n\t" - - "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), - [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [zero] "f"(0x00), [three] "f"(0x03), [five] "f"(0x05), - [seven] "f"(0x07), [ten] "f"(0x0a), [fifteen] "f"(0x0f) - : "memory"); -} - -void 
ARGBToARGB4444Row_MMI(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width) { - uint64_t src0, src1; - uint64_t ftmp[4]; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" - - "punpcklbh %[b], %[src0], %[src1] \n\t" - "punpckhbh %[g], %[src0], %[src1] \n\t" - "punpcklbh %[src0], %[b], %[g] \n\t" - "punpckhbh %[src1], %[b], %[g] \n\t" - "punpcklbh %[b], %[src0], %[zero] \n\t" - "punpckhbh %[g], %[src0], %[zero] \n\t" - "punpcklbh %[r], %[src1], %[zero] \n\t" - "punpckhbh %[a], %[src1], %[zero] \n\t" - - "psrlh %[b], %[b], %[four] \n\t" - "psrlh %[g], %[g], %[four] \n\t" - "psrlh %[r], %[r], %[four] \n\t" - "psrlh %[a], %[a], %[four] \n\t" - - "psllh %[g], %[g], %[four] \n\t" - "psllh %[r], %[r], %[eight] \n\t" - "psllh %[a], %[a], %[twelve] \n\t" - "or %[b], %[b], %[g] \n\t" - "or %[b], %[b], %[r] \n\t" - "or %[b], %[b], %[a] \n\t" - - "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" - "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x10 \n\t" - "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" - "daddiu %[width], %[width], -0x04 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), - [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3]) - : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), - [zero] "f"(0x00), [four] "f"(0x04), [eight] "f"(0x08), - [twelve] "f"(0x0c) - : "memory"); -} - -void ARGBToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0001004200810019; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw 
%[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw 
%[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void ARGBToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0013002500380002; - const uint64_t mask_v = 0x00020038002f0009; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest0_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - 
"pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest1_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" - "pmaddhw %[dest1_u], 
%[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest2_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_3 %[dest2_v], %[src0], %[value] 
\n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest3_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_3 
%[dest3_v], %[src0], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz 
%[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0019008100420001; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw 
%[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void BGRAToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - 
uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0002003800250013; - const uint64_t mask_v = 0x0009002f00380002; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[dest0_u], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_0 %[dest0_v], %[src0], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], 
%[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src1], %[src0] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src0], %[src1] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[dest1_u], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_0 %[dest1_v], %[src0], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] 
\n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src1], %[src0] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src0], %[src1] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[dest2_u], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_0 %[dest2_v], %[src0], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw 
%[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src1], %[src0] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src0], %[src1] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[dest3_u], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_0 %[dest3_v], %[src0], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsrl %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_0 %[src_hi], 
%[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src1], %[src0] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src0], %[src1] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, 
dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0001001900810042; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], 
%[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void ABGRToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0002003800250013; - const uint64_t mask_v = 0x0009002F00380002; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh 
%[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t" - "dsll %[dest0_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src1], %[src0] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src0], %[src1] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], 
%[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t" - "dsll %[dest1_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src1], %[src0] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src0], %[src1] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] 
\n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t" - "dsll %[dest2_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src1], %[src0] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src0], %[src1] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh 
%[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t" - "dsll %[dest3_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src1], %[src0] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src0], %[src1] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - 
"packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0042008100190001; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], 
%[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] 
"=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void RGBAToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0013002500380002; - const uint64_t mask_v = 0x00020038002f0009; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[dest0_u], %[src0], %[value] \n\t" - "dsrl %[dest0_v], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest0_v], %[dest0_v], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], 
%[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" - "dsrl %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[dest1_u], %[src0], %[value] \n\t" - "dsrl %[dest1_v], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest1_v], %[dest1_v], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], 
%[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" - "dsrl %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[dest2_u], %[src0], %[value] \n\t" - "dsrl %[dest2_v], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest2_v], %[dest2_v], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - 
"punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" - "dsrl %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[dest3_u], %[src0], %[value] \n\t" - "dsrl %[dest3_v], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[dest3_v], %[dest3_v], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh 
%[src0], %[src0], %[src_lo] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" - "dsrl %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] 
"r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0001004200810019; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x06(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x13(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x0c(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] 
\n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x19(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x12(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x18 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void RGB24ToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0013002500380002; - const uint64_t mask_v = 0x00020038002f0009; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], 
%[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest0_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], 
%[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest1_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], 
%[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest2_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], 
%[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[dest3_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" 
- "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - 
[mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest0, dest1, dest2, dest3; - const uint64_t value = 0x1080; - const uint64_t mask = 0x0001001900810042; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[dest0], %[src] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x06(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[dest1], %[src] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldlc1 %[src], 0x13(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x0c(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], 
%[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[dest2], %[src] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "gsldlc1 %[src], 0x19(%[src_argb]) \n\t" - "gsldrc1 %[src], 0x12(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src], %[zero] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "dsll %[src], %[src], %[eight] \n\t" - "punpckhbh %[src_hi], %[src], %[zero] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[dest3], %[src] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x18 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3) - : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), - [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), - [zero] "f"(0x00) - : "memory"); -} - -void RAWToUVRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[13]; - uint64_t tmp[1]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0002003800250013; - const uint64_t mask_v = 0x0009002f00380002; - - __asm__ volatile( - "dli %[tmp0], 0x0001000100010001 \n\t" - "dmtc1 %[tmp0], %[ftmp12] \n\t" - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - 
"gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t" - "dsll %[dest0_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src1], %[src0] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - 
"punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src0], %[src1] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t" - "dsll %[dest1_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - 
"punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src1], %[src0] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src0], %[src1] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t" - "dsll %[dest2_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - 
"dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src1], %[src0] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src0], %[src1] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t" - "dsll %[dest3_v], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "dsll %[src0], %[src0], %[eight] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src_hi] \n\t" - "punpcklbh %[src_lo], %[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_lo] \n\t" - "dsll %[src1], %[src1], %[eight] \n\t" - "punpckhbh %[src_hi], 
%[src1], %[zero] \n\t" - "paddh %[src0], %[src0], %[src_hi] \n\t" - "paddh %[src0], %[src0], %[ftmp12] \n\t" - "psrlh %[src0], %[src0], %[one] \n\t" - "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" - "dsll %[src_hi], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src1], %[src0] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src0], %[src1] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), - [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), 
[mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), - [sixteen] "f"(0x10) - : "memory"); -} - -void ARGBToYJRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { - uint64_t src, src_hi, src_lo; - uint64_t dest, dest0, dest1, dest2, dest3; - uint64_t tmp0, tmp1; - const uint64_t shift = 0x08; - const uint64_t value = 0x80; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x0001004D0096001DULL; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" - "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - - "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" - "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - - "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" - "punpcklwd %[tmp0], 
%[src_lo], %[src_hi] \n\t" - "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest2], %[dest2], %[shift] \n\t" - - "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t" - "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest3], %[dest3], %[shift] \n\t" - - "packsswh %[tmp0], %[dest0], %[dest1] \n\t" - "packsswh %[tmp1], %[dest2], %[dest3] \n\t" - "packushb %[dest], %[tmp0], %[tmp1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), - [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [tmp0] "=&f"(tmp0), - [tmp1] "=&f"(tmp1) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0), - [mask1] "f"(mask1), [shift] "f"(shift), [value] "f"(value), - [width] "r"(width) - : "memory"); -} - -void ARGBToUVJRow_MMI(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src_rgb1; - uint64_t ftmp[12]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0015002a003f0002; - const uint64_t mask_v = 0x0002003f0035000a; - - __asm__ volatile( - "1: \n\t" - "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 
0x00(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[dest0_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], 
%[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[dest1_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - 
"paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[dest2_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[dest3_u], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[dest3_u], 
%[dest3_u], %[value] \n\t" - "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - - "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" - "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" - "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "punpcklbh %[src0], %[src1], %[zero] \n\t" - "punpckhbh %[src1], %[src1], %[zero] \n\t" - "paddh %[src0], %[src_lo], %[src0] \n\t" - "paddh %[src1], %[src_hi], %[src1] \n\t" - "pavgh %[src0], %[src0], %[src1] \n\t" - "dsll %[src_lo], %[src0], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src_rgb1] "=&r"(src_rgb1), [src0] 
"=&f"(ftmp[0]), - [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), - [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), - [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) - : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), - [sixteen] "f"(0x10) - : "memory"); -} - -void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - uint64_t ftmp[11]; - const uint64_t value = 0x1080108010801080; - const uint64_t mask = 0x0001004200810019; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0007000700070007; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psrlh %[r], %[src1], %[three] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[two] \n\t" - "psrlh %[src1], %[g], %[four] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" 
- "paddw %[dest0], %[src0], %[src1] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[src0], %[src1] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psrlh %[r], %[src1], %[three] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[two] \n\t" - "psrlh %[src1], %[g], %[four] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[src0], %[src1] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - 
"punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[src0], %[src1] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_rgb565], %[src_rgb565], 0x10 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddiu %[width], %[width], -0x08 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), - [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]), - [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]), - [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10]) - : [src_rgb565] "r"(src_rgb565), [dst_y] "r"(dst_y), [value] "f"(value), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), - [mask] "f"(mask), [eight] "f"(0x08), [five] "f"(0x05), - [three] "f"(0x03), [two] "f"(0x02), [four] "f"(0x04) - : "memory"); -} - -void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, - uint8_t* dst_y, - int width) { - uint64_t ftmp[11]; - const uint64_t value = 0x1080108010801080; - const uint64_t mask = 0x0001004200810019; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0003000300030003; - uint64_t c3 = 0x007c007c007c007c; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "and %[r], %[src1], %[c3] \n\t" - "psrlh %[r], %[r], %[two] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], 
%[src1] \n\t" - "psllh %[src0], %[g], %[three] \n\t" - "psrlh %[src1], %[g], %[two] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[src0], %[src1] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[src0], %[src1] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g], %[src1], %[c2] \n\t" - "psllh %[g], %[g], %[three] \n\t" - "or %[g], %[src0], %[g] \n\t" - "and %[r], %[src1], %[c3] \n\t" - "psrlh %[r], %[r], %[two] \n\t" - "psllh %[src0], %[b], %[three] \n\t" - "psrlh %[src1], %[b], %[two] \n\t" - "or %[b], %[src0], %[src1] \n\t" - "psllh %[src0], %[g], %[three] \n\t" - "psrlh %[src1], %[g], %[two] \n\t" - "or %[g], %[src0], %[src1] \n\t" - "psllh %[src0], %[r], %[three] \n\t" - "psrlh %[src1], %[r], %[two] \n\t" - "or %[r], %[src0], %[src1] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw 
%[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[src0], %[src1] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[src0], %[src1] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb1555], %[src_argb1555], 0x10 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddiu %[width], %[width], -0x08 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), - [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]), - [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]), - [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10]) - : [src_argb1555] "r"(src_argb1555), [dst_y] "r"(dst_y), - [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0), - [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [eight] "f"(0x08), - [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07) - : "memory"); -} - -void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, - uint8_t* dst_y, - int width) { - uint64_t ftmp[11]; - uint64_t value = 0x1080108010801080; - uint64_t mask = 0x0001004200810019; - uint64_t c0 = 0x000f000f000f000f; - uint64_t c1 
= 0x00ff00ff00ff00ff; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g], %[src0], %[four] \n\t" - "and %[r], %[src1], %[c0] \n\t" - "psllh %[src0], %[b], %[four] \n\t" - "or %[b], %[src0], %[b] \n\t" - "psllh %[src0], %[g], %[four] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psllh %[src0], %[r], %[four] \n\t" - "or %[r], %[src0], %[r] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest0], %[src0], %[src1] \n\t" - "psrlw %[dest0], %[dest0], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest1], %[src0], %[src1] \n\t" - "psrlw %[dest1], %[dest1], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t" - "psrlh %[src1], %[src0], %[eight] \n\t" - "and %[b], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g], %[src0], %[four] \n\t" - "and %[r], %[src1], %[c0] \n\t" - "psllh %[src0], %[b], %[four] \n\t" - "or %[b], %[src0], %[b] \n\t" - "psllh %[src0], %[g], %[four] \n\t" - "or %[g], %[src0], %[g] \n\t" - "psllh %[src0], %[r], %[four] \n\t" - "or %[r], %[src0], %[r] \n\t" - "punpcklhw %[src0], %[b], %[r] \n\t" - "punpcklhw 
%[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest2], %[src0], %[src1] \n\t" - "psrlw %[dest2], %[dest2], %[eight] \n\t" - - "punpckhhw %[src0], %[b], %[r] \n\t" - "punpckhhw %[src1], %[g], %[value] \n\t" - "punpcklhw %[src_lo], %[src0], %[src1] \n\t" - "punpckhhw %[src_hi], %[src0], %[src1] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" - "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" - "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" - "paddw %[dest3], %[src0], %[src1] \n\t" - "psrlw %[dest3], %[dest3], %[eight] \n\t" - - "packsswh %[src_lo], %[dest0], %[dest1] \n\t" - "packsswh %[src_hi], %[dest2], %[dest3] \n\t" - "packushb %[dest0], %[src_lo], %[src_hi] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - - "daddiu %[src_argb4444], %[src_argb4444], 0x10 \n\t" - "daddiu %[dst_y], %[dst_y], 0x08 \n\t" - "daddiu %[width], %[width], -0x08 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), - [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]), - [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]), - [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10]) - : [src_argb4444] "r"(src_argb4444), [dst_y] "r"(dst_y), - [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0), - [c1] "f"(c1), [eight] "f"(0x08), [four] "f"(0x04) - : "memory"); -} - -void RGB565ToUVRow_MMI(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t ftmp[13]; - uint64_t value = 0x2020202020202020; - uint64_t mask_u = 0x0026004a00700002; - uint64_t mask_v = 0x00020070005e0012; - uint64_t 
mask = 0x93; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0007000700070007; - __asm__ volatile( - "daddu %[next_rgb565], %[src_rgb565], %[next_rgb565] \n\t" - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" - "gsldrc1 %[src1], 0x00(%[next_rgb565]) \n\t" - "gsldlc1 %[src1], 0x07(%[next_rgb565]) \n\t" - "psrlh %[dest0_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest0_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "psrlh %[r0], %[dest0_u], %[three] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest0_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest0_v], %[src0], %[c2] \n\t" - "psllh %[dest0_v], %[dest0_v], %[three] \n\t" - "or %[dest0_v], %[src1], %[dest0_v] \n\t" - "psrlh %[src0], %[src0], %[three] \n\t" - "paddh %[b0], %[b0], %[dest0_u] \n\t" - "paddh %[g0], %[g0], %[dest0_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest0_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - 
"pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t" - "gsldrc1 %[src1], 0x08(%[next_rgb565]) \n\t" - "gsldlc1 %[src1], 0x0f(%[next_rgb565]) \n\t" - "psrlh %[dest1_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest1_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "psrlh %[r0], %[dest1_u], %[three] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest1_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest1_v], %[src0], %[c2] \n\t" - "psllh %[dest1_v], %[dest1_v], %[three] \n\t" - "or %[dest1_v], %[src1], %[dest1_v] \n\t" - "psrlh %[src0], %[src0], %[three] \n\t" - "paddh %[b0], %[b0], %[dest1_u] \n\t" - "paddh %[g0], %[g0], %[dest1_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] 
\n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest1_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest1_u], %[b0] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb565]) \n\t" - "gsldrc1 %[src1], 0x10(%[next_rgb565]) \n\t" - "gsldlc1 %[src1], 0x17(%[next_rgb565]) \n\t" - "psrlh %[dest2_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest2_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "psrlh %[r0], %[dest2_u], %[three] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest2_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest2_v], %[src0], %[c2] \n\t" - "psllh %[dest2_v], %[dest2_v], %[three] \n\t" - "or %[dest2_v], %[src1], %[dest2_v] \n\t" - "psrlh %[src0], %[src0], %[three] \n\t" - "paddh %[b0], %[b0], %[dest2_u] \n\t" - "paddh %[g0], %[g0], %[dest2_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest2_u], %[dest2_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], 
%[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest2_u], %[dest2_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest2_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest2_v], %[g0] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_rgb565]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb565]) \n\t" - "gsldrc1 %[src1], 0x18(%[next_rgb565]) \n\t" - "gsldlc1 %[src1], 0x1f(%[next_rgb565]) \n\t" - "psrlh %[dest3_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest3_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "psrlh %[r0], %[dest3_u], %[three] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest3_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest3_v], %[src0], %[c2] \n\t" - "psllh %[dest3_v], %[dest3_v], %[three] \n\t" - "or %[dest3_v], %[src1], %[dest3_v] \n\t" - "psrlh %[src0], %[src0], %[three] \n\t" - "paddh %[b0], %[b0], %[dest3_u] \n\t" - "paddh %[g0], %[g0], %[dest3_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] 
\n\t" - "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest3_u], %[dest3_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest3_u], %[dest3_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest3_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest3_v], %[g0] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_rgb565], %[src_rgb565], 0x20 \n\t" - "daddiu %[next_rgb565], %[next_rgb565], 0x20 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddiu %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), - [g0] "=&f"(ftmp[3]), [r0] 
"=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), - [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), - [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]), - [dest3_v] "=&f"(ftmp[12]) - : [src_rgb565] "r"(src_rgb565), [next_rgb565] "r"(src_stride_rgb565), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [value] "f"(value), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), - [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), - [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03), - [one] "f"(0x01) - : "memory"); -} - -void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t ftmp[11]; - uint64_t value = 0x2020202020202020; - uint64_t mask_u = 0x0026004a00700002; - uint64_t mask_v = 0x00020070005e0012; - uint64_t mask = 0x93; - uint64_t c0 = 0x001f001f001f001f; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t c2 = 0x0003000300030003; - uint64_t c3 = 0x007c007c007c007c; - __asm__ volatile( - "daddu %[next_argb1555], %[src_argb1555], %[next_argb1555] \n\t" - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" - "gsldrc1 %[src1], 0x00(%[next_argb1555]) \n\t" - "gsldlc1 %[src1], 0x07(%[next_argb1555]) \n\t" - "psrlh %[dest0_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest0_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "and %[r0], %[dest0_u], %[c3] \n\t" - "psrlh %[r0], %[r0], %[two] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest0_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest0_v], %[src0], %[c2] \n\t" - "psllh %[dest0_v], %[dest0_v], %[three] \n\t" - "or %[dest0_v], %[src1], %[dest0_v] \n\t" - "and %[src0], 
%[src0], %[c3] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" - "paddh %[b0], %[b0], %[dest0_u] \n\t" - "paddh %[g0], %[g0], %[dest0_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[six] \n\t" - "psllh %[g0], %[g0], %[one] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest0_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t" - "gsldrc1 %[src1], 0x08(%[next_argb1555]) \n\t" - "gsldlc1 %[src1], 0x0f(%[next_argb1555]) \n\t" - "psrlh %[dest1_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest1_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - 
"or %[g0], %[src0], %[g0] \n\t" - "and %[r0], %[dest1_u], %[c3] \n\t" - "psrlh %[r0], %[r0], %[two] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest1_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest1_v], %[src0], %[c2] \n\t" - "psllh %[dest1_v], %[dest1_v], %[three] \n\t" - "or %[dest1_v], %[src1], %[dest1_v] \n\t" - "and %[src0], %[src0], %[c3] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" - "paddh %[b0], %[b0], %[dest1_u] \n\t" - "paddh %[g0], %[g0], %[dest1_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[six] \n\t" - "psllh %[g0], %[g0], %[one] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest1_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest1_u], %[b0] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "packsswh 
%[dest0_u], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[dest1_u], %[dest0_v], %[dest1_v] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_argb1555]) \n\t" - "gsldrc1 %[src1], 0x10(%[next_argb1555]) \n\t" - "gsldlc1 %[src1], 0x17(%[next_argb1555]) \n\t" - "psrlh %[dest2_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest2_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "and %[r0], %[dest2_u], %[c3] \n\t" - "psrlh %[r0], %[r0], %[two] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest2_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest0_v], %[src0], %[c2] \n\t" - "psllh %[dest0_v], %[dest0_v], %[three] \n\t" - "or %[dest0_v], %[src1], %[dest0_v] \n\t" - "and %[src0], %[src0], %[c3] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" - "paddh %[b0], %[b0], %[dest2_u] \n\t" - "paddh %[g0], %[g0], %[dest0_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest2_u], %[dest0_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[six] \n\t" - "psllh %[g0], %[g0], %[one] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest2_u], %[dest0_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest2_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], 
%[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_argb1555]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_argb1555]) \n\t" - "gsldrc1 %[src1], 0x18(%[next_argb1555]) \n\t" - "gsldlc1 %[src1], 0x1f(%[next_argb1555]) \n\t" - "psrlh %[dest3_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[src0], %[src0], %[five] \n\t" - "and %[g0], %[dest3_u], %[c2] \n\t" - "psllh %[g0], %[g0], %[three] \n\t" - "or %[g0], %[src0], %[g0] \n\t" - "and %[r0], %[dest3_u], %[c3] \n\t" - "psrlh %[r0], %[r0], %[two] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest3_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[src1], %[src1], %[five] \n\t" - "and %[dest1_v], %[src0], %[c2] \n\t" - "psllh %[dest1_v], %[dest1_v], %[three] \n\t" - "or %[dest1_v], %[src1], %[dest1_v] \n\t" - "and %[src0], %[src0], %[c3] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" - "paddh %[b0], %[b0], %[dest3_u] \n\t" - "paddh %[g0], %[g0], %[dest1_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest3_u], %[dest1_v] \n\t" - "psrlh %[b0], %[src0], %[six] \n\t" - "psllh %[r0], %[src0], %[one] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[six] \n\t" - "psllh %[g0], %[g0], %[one] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], 
%[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest3_u], %[dest1_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest3_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[dest0_u], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - "packsswh %[src1], %[dest0_v], %[dest1_v] \n\t" - "packushb %[dest0_v], %[dest1_u], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_argb1555], %[src_argb1555], 0x20 \n\t" - "daddiu %[next_argb1555], %[next_argb1555], 0x20 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddiu %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), - [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), - [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), - [dest1_v] "=&f"(ftmp[10]) - : [src_argb1555] "r"(src_argb1555), - [next_argb1555] "r"(src_stride_argb1555), [dst_u] "r"(dst_u), - [dst_v] "r"(dst_v), [width] "r"(width), [value] 
"f"(value), - [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), - [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), - [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03), - [two] "f"(0x02), [one] "f"(0x01) - : "memory"); -} - -void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t ftmp[13]; - uint64_t value = 0x2020202020202020; - uint64_t mask_u = 0x0026004a00700002; - uint64_t mask_v = 0x00020070005e0012; - uint64_t mask = 0x93; - uint64_t c0 = 0x000f000f000f000f; - uint64_t c1 = 0x00ff00ff00ff00ff; - __asm__ volatile( - "daddu %[next_argb4444], %[src_argb4444], %[next_argb4444] \n\t" - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" - "gsldrc1 %[src1], 0x00(%[next_argb4444]) \n\t" - "gsldlc1 %[src1], 0x07(%[next_argb4444]) \n\t" - "psrlh %[dest0_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g0], %[src0], %[four] \n\t" - "and %[r0], %[dest0_u], %[c0] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest0_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[dest0_v], %[src1], %[four] \n\t" - "and %[src0], %[src0], %[c0] \n\t" - "paddh %[b0], %[b0], %[dest0_u] \n\t" - "paddh %[g0], %[g0], %[dest0_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" - "psrlh %[b0], %[src0], %[four] \n\t" - "psllh %[r0], %[src0], %[two] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[four] \n\t" - "psllh %[g0], %[g0], %[two] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest0_u], %[src0], 
%[src1] \n\t" - "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest0_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t" - "gsldrc1 %[src1], 0x08(%[next_argb4444]) \n\t" - "gsldlc1 %[src1], 0x0f(%[next_argb4444]) \n\t" - "psrlh %[dest1_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g0], %[src0], %[four] \n\t" - "and %[r0], %[dest1_u], %[c0] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest1_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[dest1_v], %[src1], %[four] \n\t" - "and %[src0], %[src0], %[c0] \n\t" - "paddh %[b0], %[b0], %[dest1_u] \n\t" - "paddh %[g0], %[g0], %[dest1_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" - "psrlh %[b0], %[src0], %[four] \n\t" - "psllh %[r0], %[src0], %[two] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[four] \n\t" - "psllh %[g0], %[g0], %[two] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], 
%[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest1_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest1_u], %[b0] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_argb4444]) \n\t" - "gsldrc1 %[src1], 0x10(%[next_argb4444]) \n\t" - "gsldlc1 %[src1], 0x17(%[next_argb4444]) \n\t" - "psrlh %[dest2_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g0], %[src0], %[four] \n\t" - "and %[r0], %[dest2_u], %[c0] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest2_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[dest2_v], %[src1], %[four] \n\t" - "and %[src0], %[src0], %[c0] \n\t" - "paddh %[b0], %[b0], %[dest2_u] \n\t" - "paddh %[g0], %[g0], %[dest2_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest2_u], %[dest2_v] \n\t" - "psrlh %[b0], %[src0], %[four] \n\t" - "psllh %[r0], %[src0], %[two] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], 
%[four] \n\t" - "psllh %[g0], %[g0], %[two] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest2_u], %[dest2_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest2_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest2_v], %[g0] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_argb4444]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_argb4444]) \n\t" - "gsldrc1 %[src1], 0x18(%[next_argb4444]) \n\t" - "gsldlc1 %[src1], 0x1f(%[next_argb4444]) \n\t" - "psrlh %[dest3_u], %[src0], %[eight] \n\t" - "and %[b0], %[src0], %[c0] \n\t" - "and %[src0], %[src0], %[c1] \n\t" - "psrlh %[g0], %[src0], %[four] \n\t" - "and %[r0], %[dest3_u], %[c0] \n\t" - "psrlh %[src0], %[src1], %[eight] \n\t" - "and %[dest3_u], %[src1], %[c0] \n\t" - "and %[src1], %[src1], %[c1] \n\t" - "psrlh %[dest3_v], %[src1], %[four] \n\t" - "and %[src0], %[src0], %[c0] \n\t" - "paddh %[b0], %[b0], %[dest3_u] \n\t" - "paddh %[g0], %[g0], %[dest3_v] \n\t" - "paddh %[r0], %[r0], %[src0] \n\t" - "punpcklhw %[src0], %[b0], %[r0] \n\t" - "punpckhhw %[src1], %[b0], %[r0] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" - "paddh %[src0], %[dest3_u], %[dest3_v] \n\t" - "psrlh %[b0], %[src0], %[four] 
\n\t" - "psllh %[r0], %[src0], %[two] \n\t" - "or %[b0], %[b0], %[r0] \n\t" - "psrlh %[r0], %[g0], %[four] \n\t" - "psllh %[g0], %[g0], %[two] \n\t" - "or %[g0], %[g0], %[r0] \n\t" - "punpcklhw %[src0], %[g0], %[value] \n\t" - "punpckhhw %[src1], %[g0], %[value] \n\t" - "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" - "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" - "paddh %[g0], %[dest3_u], %[dest3_v] \n\t" - "punpcklhw %[src0], %[b0], %[g0] \n\t" - "punpckhhw %[src1], %[b0], %[g0] \n\t" - - "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t" - "pshufh %[dest3_u], %[src0], %[mask] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[g0], %[src1], %[mask_v] \n\t" - "pshufh %[b0], %[src1], %[mask] \n\t" - "pmaddhw %[b0], %[b0], %[mask_u] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" - "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[g0] \n\t" - "punpckhwd %[src1], %[dest3_v], %[g0] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_argb4444], %[src_argb4444], 0x20 \n\t" - "daddiu %[next_argb4444], %[next_argb4444], 0x20 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddiu %[width], %[width], -0x10 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), - [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] 
"=&f"(ftmp[5]), - [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), - [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), - [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]), - [dest3_v] "=&f"(ftmp[12]) - : [src_argb4444] "r"(src_argb4444), - [next_argb4444] "r"(src_stride_argb4444), [dst_u] "r"(dst_u), - [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value), - [c0] "f"(c0), [c1] "f"(c1), [mask] "f"(mask), [mask_u] "f"(mask_u), - [mask_v] "f"(mask_v), [eight] "f"(0x08), [four] "f"(0x04), - [two] "f"(0x02) - : "memory"); -} - -void ARGBToUV444Row_MMI(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t ftmp[12]; - const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0026004a00700002; - const uint64_t mask_v = 0x00020070005e0012; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "dsll %[dest0_u], %[src_lo], %[sixteen] \n\t" - "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" - "pinsrh_3 %[dest0_v], %[src_lo], %[value] \n\t" - "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" - "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - - "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" - "psubw %[dest0_u], %[src0], %[src1] \n\t" - "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" - "psubw %[dest0_v], %[src1], %[src0] \n\t" - "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x08(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_argb]) \n\t" - "punpcklbh %[src_lo], 
%[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "dsll %[dest1_u], %[src_lo], %[sixteen] \n\t" - "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" - "pinsrh_3 %[dest1_v], %[src_lo], %[value] \n\t" - "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" - "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" - "psubw %[dest1_u], %[src0], %[src1] \n\t" - "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" - "psubw %[dest1_v], %[src1], %[src0] \n\t" - "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "dsll %[dest2_u], %[src_lo], %[sixteen] \n\t" - "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" - "pinsrh_3 %[dest2_v], %[src_lo], %[value] \n\t" - "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" - "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" - "psubw %[dest2_u], %[src0], %[src1] \n\t" - "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" - "psubw %[dest2_v], %[src1], %[src0] \n\t" - "psraw %[dest2_v], %[dest2_v], 
%[eight] \n\t" - - "gsldrc1 %[src0], 0x18(%[src_argb]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_argb]) \n\t" - "punpcklbh %[src_lo], %[src0], %[zero] \n\t" - "punpckhbh %[src_hi], %[src0], %[zero] \n\t" - "dsll %[dest3_u], %[src_lo], %[sixteen] \n\t" - "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" - "pinsrh_3 %[dest3_v], %[src_lo], %[value] \n\t" - "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" - "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" - "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" - "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" - "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" - - "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" - "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" - "psubw %[dest3_u], %[src0], %[src1] \n\t" - "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" - "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" - "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" - "psubw %[dest3_v], %[src1], %[src0] \n\t" - "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" - - "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" - "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" - "packushb %[dest0_u], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" - "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" - - "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" - "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" - "packushb %[dest0_v], %[src0], %[src1] \n\t" - "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" - "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - - "daddiu %[src_argb], %[src_argb], 0x20 \n\t" - "daddiu %[dst_u], %[dst_u], 0x08 \n\t" - "daddiu %[dst_v], %[dst_v], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bgtz %[width], 1b \n\t" - : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), - [src_hi] "=&f"(ftmp[3]), [dest0_u] "=&f"(ftmp[4]), - [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), - [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), - [dest2_v] 
"=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), - [dest3_v] "=&f"(ftmp[11]) - : [src_argb] "r"(src_argb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), - [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), - [value] "f"(value), [zero] "f"(0x00), [sixteen] "f"(0x10), - [eight] "f"(0x08) - : "memory"); -} - -void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - uint64_t src, src_lo, src_hi, src37, dest, dest_lo, dest_hi; - uint64_t tmp0, tmp1; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x01; - const uint64_t mask2 = 0x0080004D0096001DULL; - const uint64_t mask3 = 0xFF000000FF000000ULL; - const uint64_t mask4 = ~mask3; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - - "and %[src37], %[src], %[mask3] \n\t" - - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_lo], %[src_lo], %[mask1] \n\t" - "pmaddhw %[dest_lo], %[src_lo], %[mask2] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_lo] \n\t" - "punpckhwd %[tmp1], %[dest_lo], %[dest_lo] \n\t" - "paddw %[dest_lo], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest_lo], %[dest_lo], %[shift] \n\t" - "packsswh %[dest_lo], %[dest_lo], %[dest_lo] \n\t" - - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - "pinsrh_3 %[src_hi], %[src_hi], %[mask1] \n\t" - "pmaddhw %[dest_hi], %[src_hi], %[mask2] \n\t" - "punpcklwd %[tmp0], %[dest_hi], %[dest_hi] \n\t" - "punpckhwd %[tmp1], %[dest_hi], %[dest_hi] \n\t" - "paddw %[dest_hi], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest_hi], %[dest_hi], %[shift] \n\t" - "packsswh %[dest_hi], %[dest_hi], %[dest_hi] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "and %[dest], %[dest], %[mask4] \n\t" - "or %[dest], %[dest], %[src37] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - 
"bnez %[width], 1b \n\t" - : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [tmp0] "=&f"(tmp0), - [tmp1] "=&f"(tmp1), [src] "=&f"(src), [dest] "=&f"(dest), - [src37] "=&f"(src37) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width), - [shift] "f"(shift), [mask0] "f"(mask0), [mask1] "f"(mask1), - [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4) - : "memory"); -} - -// Convert a row of image to Sepia tone. -void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width) { - uint64_t dest, dest_lo, dest_hi, dest37, dest0, dest1, dest2; - uint64_t tmp0, tmp1; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x002300440011ULL; - const uint64_t mask2 = 0x002D00580016ULL; - const uint64_t mask3 = 0x003200620018ULL; - const uint64_t mask4 = 0xFF000000FF000000ULL; - const uint64_t shift = 0x07; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "and %[dest37], %[dest], %[mask4] \n\t" - - "punpcklbh %[dest_lo], %[dest], %[mask0] \n\t" - "pmaddhw %[dest0], %[dest_lo], %[mask1] \n\t" - "pmaddhw %[dest1], %[dest_lo], %[mask2] \n\t" - "pmaddhw %[dest2], %[dest_lo], %[mask3] \n\t" - "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t" - "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t" - "paddw %[dest0], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t" - "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t" - "paddw %[dest1], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - "packsswh %[dest_lo], %[dest0], %[dest1] \n\t" - - "punpckhbh %[dest_hi], %[dest], %[mask0] \n\t" - "pmaddhw %[dest0], %[dest_hi], %[mask1] \n\t" - "pmaddhw %[dest1], %[dest_hi], %[mask2] \n\t" - "pmaddhw %[dest2], %[dest_hi], %[mask3] \n\t" - "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t" - "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t" - "paddw %[dest0], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest0], 
%[dest0], %[shift] \n\t" - "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t" - "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t" - "paddw %[dest1], %[tmp0], %[tmp1] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - "packsswh %[dest_hi], %[dest0], %[dest1] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "or %[dest], %[dest], %[dest37] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest37] "=&f"(dest37), [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), - [dest] "=&f"(dest) - : [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask0] "f"(mask0), - [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3), - [mask4] "f"(mask4), [shift] "f"(shift) - : "memory"); -} - -// Apply color matrix to a row of image. Matrix is signed. -// TODO(fbarchard): Consider adding rounding (+32). 
-void ARGBColorMatrixRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - const int8_t* matrix_argb, - int width) { - uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi, dest0, dest1, dest2, - dest3; - uint64_t matrix, matrix_hi, matrix_lo; - uint64_t tmp0, tmp1; - const uint64_t shift0 = 0x06; - const uint64_t shift1 = 0x08; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - - "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t" - "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t" - "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" - "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" - "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t" - "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" - "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" - "paddw %[dest0], %[tmp0], %[tmp1] \n\t" - "psraw %[dest0], %[dest0], %[shift0] \n\t" - - "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t" - "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t" - "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" - "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" - "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t" - "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" - "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" - "paddw %[dest1], %[tmp0], %[tmp1] \n\t" - "psraw %[dest1], %[dest1], %[shift0] \n\t" - - "punpckhbh 
%[src_hi], %[src], %[mask0] \n\t" - - "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t" - "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t" - "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" - "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" - "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t" - "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" - "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" - "paddw %[dest2], %[tmp0], %[tmp1] \n\t" - "psraw %[dest2], %[dest2], %[shift0] \n\t" - - "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t" - "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t" - "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" - "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" - "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" - "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" - "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t" - "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t" - "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" - "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" - "paddw %[dest3], %[tmp0], %[tmp1] \n\t" - "psraw %[dest3], %[dest3], %[shift0] \n\t" - - "packsswh %[tmp0], %[dest0], %[dest1] \n\t" - "packsswh %[tmp1], %[dest2], %[dest3] \n\t" - "packushb %[dest], %[tmp0], %[tmp1] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), 
[dest2] "=&f"(dest2), - [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest), - [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [matrix_hi] "=&f"(matrix_hi), - [matrix_lo] "=&f"(matrix_lo), [matrix] "=&f"(matrix) - : [src_ptr] "r"(src_argb), [matrix_ptr] "r"(matrix_argb), - [dst_ptr] "r"(dst_argb), [width] "r"(width), [shift0] "f"(shift0), - [shift1] "f"(shift1), [mask0] "f"(mask0), [mask1] "f"(mask1) - : "memory"); -} - -void ARGBShadeRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - int width, - uint32_t value) { - uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[src] \n\t" - "punpckhbh %[src_hi], %[src], %[src] \n\t" - - "punpcklbh %[value], %[value], %[value] \n\t" - - "pmulhuh %[dest_lo], %[src_lo], %[value] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" - "pmulhuh %[dest_hi], %[src_hi], %[value] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src] "=&f"(src), - [dest] "=&f"(dest) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width), - [value] "f"(value), [shift] "f"(shift) - : "memory"); -} - -void ARGBMultiplyRow_MMI(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - uint64_t src0, src0_hi, src0_lo, src1, src1_hi, src1_lo; - uint64_t dest, dest_lo, dest_hi; - const uint64_t mask = 0x0; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) 
\n\t" - "punpcklbh %[src0_lo], %[src0], %[src0] \n\t" - "punpckhbh %[src0_hi], %[src0], %[src0] \n\t" - - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src1_lo], %[src1], %[mask] \n\t" - "punpckhbh %[src1_hi], %[src1], %[mask] \n\t" - - "pmulhuh %[dest_lo], %[src0_lo], %[src1_lo] \n\t" - "pmulhuh %[dest_hi], %[src0_hi], %[src1_hi] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), - [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src0] "=&f"(src0), - [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), - [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask] "f"(mask) - : "memory"); -} - -void ARGBAddRow_MMI(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - uint64_t src0, src1, dest; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "paddusb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), - [dst_ptr] "r"(dst_argb), [width] "r"(width) - : "memory"); -} - -void 
ARGBSubtractRow_MMI(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - uint64_t src0, src1, dest; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "psubusb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), - [dst_ptr] "r"(dst_argb), [width] "r"(width) - : "memory"); -} - -// Sobel functions which mimics SSSE3. -void SobelXRow_MMI(const uint8_t* src_y0, - const uint8_t* src_y1, - const uint8_t* src_y2, - uint8_t* dst_sobelx, - int width) { - uint64_t y00 = 0, y10 = 0, y20 = 0; - uint64_t y02 = 0, y12 = 0, y22 = 0; - uint64_t zero = 0x0; - uint64_t sobel = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i] - "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t" - "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // a_sub=src_y0[i+2] - "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t" - - "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // b=src_y1[i] - "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t" - "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // b_sub=src_y1[i+2] - "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t" - - "gsldlc1 %[y20], 0x07(%[src_y2]) \n\t" // c=src_y2[i] - "gsldrc1 %[y20], 0x00(%[src_y2]) \n\t" - "gsldlc1 %[y22], 0x09(%[src_y2]) \n\t" // c_sub=src_y2[i+2] - "gsldrc1 %[y22], 0x02(%[src_y2]) \n\t" - - "punpcklbh %[y00], %[y00], %[zero] \n\t" - "punpcklbh %[y10], %[y10], %[zero] \n\t" - "punpcklbh %[y20], %[y20], %[zero] \n\t" - - "punpcklbh %[y02], %[y02], %[zero] \n\t" - "punpcklbh %[y12], 
%[y12], %[zero] \n\t" - "punpcklbh %[y22], %[y22], %[zero] \n\t" - - "paddh %[y00], %[y00], %[y10] \n\t" // a+b - "paddh %[y20], %[y20], %[y10] \n\t" // c+b - "paddh %[y00], %[y00], %[y20] \n\t" // a+2b+c - - "paddh %[y02], %[y02], %[y12] \n\t" // a_sub+b_sub - "paddh %[y22], %[y22], %[y12] \n\t" // c_sub+b_sub - "paddh %[y02], %[y02], %[y22] \n\t" // a_sub+2b_sub+c_sub - - "pmaxsh %[y10], %[y00], %[y02] \n\t" - "pminsh %[y20], %[y00], %[y02] \n\t" - "psubh %[sobel], %[y10], %[y20] \n\t" // Abs - - "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t" - "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t" - "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t" - "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t" - - "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t" - "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t" - "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t" - "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t" - - "gsldlc1 %[y20], 0x0B(%[src_y2]) \n\t" - "gsldrc1 %[y20], 0x04(%[src_y2]) \n\t" - "gsldlc1 %[y22], 0x0D(%[src_y2]) \n\t" - "gsldrc1 %[y22], 0x06(%[src_y2]) \n\t" - - "punpcklbh %[y00], %[y00], %[zero] \n\t" - "punpcklbh %[y10], %[y10], %[zero] \n\t" - "punpcklbh %[y20], %[y20], %[zero] \n\t" - - "punpcklbh %[y02], %[y02], %[zero] \n\t" - "punpcklbh %[y12], %[y12], %[zero] \n\t" - "punpcklbh %[y22], %[y22], %[zero] \n\t" - - "paddh %[y00], %[y00], %[y10] \n\t" - "paddh %[y20], %[y20], %[y10] \n\t" - "paddh %[y00], %[y00], %[y20] \n\t" - - "paddh %[y02], %[y02], %[y12] \n\t" - "paddh %[y22], %[y22], %[y12] \n\t" - "paddh %[y02], %[y02], %[y22] \n\t" - - "pmaxsh %[y10], %[y00], %[y02] \n\t" - "pminsh %[y20], %[y00], %[y02] \n\t" - "psubh %[y00], %[y10], %[y20] \n\t" - - "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255 - "gssdrc1 %[sobel], 0(%[dst_sobelx]) \n\t" - "gssdlc1 %[sobel], 7(%[dst_sobelx]) \n\t" - - "daddiu %[src_y0], %[src_y0], 8 \n\t" - "daddiu %[src_y1], %[src_y1], 8 \n\t" - "daddiu %[src_y2], %[src_y2], 8 \n\t" - "daddiu %[dst_sobelx], %[dst_sobelx], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop 
\n\t" - : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y10] "=&f"(y10), - [y20] "=&f"(y20), [y02] "=&f"(y02), [y12] "=&f"(y12), [y22] "=&f"(y22) - : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), [src_y2] "r"(src_y2), - [dst_sobelx] "r"(dst_sobelx), [width] "r"(width), [zero] "f"(zero) - : "memory"); -} - -void SobelYRow_MMI(const uint8_t* src_y0, - const uint8_t* src_y1, - uint8_t* dst_sobely, - int width) { - uint64_t y00 = 0, y01 = 0, y02 = 0; - uint64_t y10 = 0, y11 = 0, y12 = 0; - uint64_t zero = 0x0; - uint64_t sobel = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i] - "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t" - "gsldlc1 %[y01], 0x08(%[src_y0]) \n\t" // b=src_y0[i+1] - "gsldrc1 %[y01], 0x01(%[src_y0]) \n\t" - "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // c=src_y0[i+2] - "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t" - - "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // a_sub=src_y1[i] - "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t" - "gsldlc1 %[y11], 0x08(%[src_y1]) \n\t" // b_sub=src_y1[i+1] - "gsldrc1 %[y11], 0x01(%[src_y1]) \n\t" - "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // c_sub=src_y1[i+2] - "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t" - - "punpcklbh %[y00], %[y00], %[zero] \n\t" - "punpcklbh %[y01], %[y01], %[zero] \n\t" - "punpcklbh %[y02], %[y02], %[zero] \n\t" - - "punpcklbh %[y10], %[y10], %[zero] \n\t" - "punpcklbh %[y11], %[y11], %[zero] \n\t" - "punpcklbh %[y12], %[y12], %[zero] \n\t" - - "paddh %[y00], %[y00], %[y01] \n\t" // a+b - "paddh %[y02], %[y02], %[y01] \n\t" // c+b - "paddh %[y00], %[y00], %[y02] \n\t" // a+2b+c - - "paddh %[y10], %[y10], %[y11] \n\t" // a_sub+b_sub - "paddh %[y12], %[y12], %[y11] \n\t" // c_sub+b_sub - "paddh %[y10], %[y10], %[y12] \n\t" // a_sub+2b_sub+c_sub - - "pmaxsh %[y02], %[y00], %[y10] \n\t" - "pminsh %[y12], %[y00], %[y10] \n\t" - "psubh %[sobel], %[y02], %[y12] \n\t" // Abs - - "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t" - "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t" - "gsldlc1 %[y01], 0x0C(%[src_y0]) \n\t" - "gsldrc1 
%[y01], 0x05(%[src_y0]) \n\t" - "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t" - "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t" - - "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t" - "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t" - "gsldlc1 %[y11], 0x0C(%[src_y1]) \n\t" - "gsldrc1 %[y11], 0x05(%[src_y1]) \n\t" - "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t" - "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t" - - "punpcklbh %[y00], %[y00], %[zero] \n\t" - "punpcklbh %[y01], %[y01], %[zero] \n\t" - "punpcklbh %[y02], %[y02], %[zero] \n\t" - - "punpcklbh %[y10], %[y10], %[zero] \n\t" - "punpcklbh %[y11], %[y11], %[zero] \n\t" - "punpcklbh %[y12], %[y12], %[zero] \n\t" - - "paddh %[y00], %[y00], %[y01] \n\t" - "paddh %[y02], %[y02], %[y01] \n\t" - "paddh %[y00], %[y00], %[y02] \n\t" - - "paddh %[y10], %[y10], %[y11] \n\t" - "paddh %[y12], %[y12], %[y11] \n\t" - "paddh %[y10], %[y10], %[y12] \n\t" - - "pmaxsh %[y02], %[y00], %[y10] \n\t" - "pminsh %[y12], %[y00], %[y10] \n\t" - "psubh %[y00], %[y02], %[y12] \n\t" - - "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255 - "gssdrc1 %[sobel], 0(%[dst_sobely]) \n\t" - "gssdlc1 %[sobel], 7(%[dst_sobely]) \n\t" - - "daddiu %[src_y0], %[src_y0], 8 \n\t" - "daddiu %[src_y1], %[src_y1], 8 \n\t" - "daddiu %[dst_sobely], %[dst_sobely], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y01] "=&f"(y01), - [y02] "=&f"(y02), [y10] "=&f"(y10), [y11] "=&f"(y11), [y12] "=&f"(y12) - : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), - [dst_sobely] "r"(dst_sobely), [width] "r"(width), [zero] "f"(zero) - : "memory"); -} - -void SobelRow_MMI(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - double temp[3]; - uint64_t c1 = 0xff000000ff000000; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[t0], 0x07(%[src_sobelx]) \n\t" // a=src_sobelx[i] - "gsldrc1 %[t0], 0x00(%[src_sobelx]) \n\t" - "gsldlc1 %[t1], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i] - "gsldrc1 %[t1], 
0x00(%[src_sobely]) \n\t" - // s7 s6 s5 s4 s3 s2 s1 s0 = a+b - "paddusb %[t2] , %[t0], %[t1] \n\t" - - // s3 s2 s1 s0->s3 s3 s2 s2 s1 s1 s0 s0 - "punpcklbh %[t0], %[t2], %[t2] \n\t" - - // s1 s1 s0 s0->s1 s2 s1 s1 s0 s0 s0 s0 - "punpcklbh %[t1], %[t0], %[t0] \n\t" - "or %[t1], %[t1], %[c1] \n\t" - // 255 s1 s1 s1 s55 s0 s0 s0 - "gssdrc1 %[t1], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[t1], 0x07(%[dst_argb]) \n\t" - - // s3 s3 s2 s2->s3 s3 s3 s3 s2 s2 s2 s2 - "punpckhbh %[t1], %[t0], %[t0] \n\t" - "or %[t1], %[t1], %[c1] \n\t" - // 255 s3 s3 s3 255 s2 s2 s2 - "gssdrc1 %[t1], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[t1], 0x0f(%[dst_argb]) \n\t" - - // s7 s6 s5 s4->s7 s7 s6 s6 s5 s5 s4 s4 - "punpckhbh %[t0], %[t2], %[t2] \n\t" - - // s5 s5 s4 s4->s5 s5 s5 s5 s4 s4 s4 s4 - "punpcklbh %[t1], %[t0], %[t0] \n\t" - "or %[t1], %[t1], %[c1] \n\t" - "gssdrc1 %[t1], 0x10(%[dst_argb]) \n\t" - "gssdlc1 %[t1], 0x17(%[dst_argb]) \n\t" - - // s7 s7 s6 s6->s7 s7 s7 s7 s6 s6 s6 s6 - "punpckhbh %[t1], %[t0], %[t0] \n\t" - "or %[t1], %[t1], %[c1] \n\t" - "gssdrc1 %[t1], 0x18(%[dst_argb]) \n\t" - "gssdlc1 %[t1], 0x1f(%[dst_argb]) \n\t" - - "daddiu %[dst_argb], %[dst_argb], 32 \n\t" - "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" - "daddiu %[src_sobely], %[src_sobely], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]) - : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely), - [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1) - : "memory"); -} - -void SobelToPlaneRow_MMI(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_y, - int width) { - uint64_t tr = 0; - uint64_t tb = 0; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[tr], 0x0(%[src_sobelx]) \n\t" - "gsldlc1 %[tr], 0x7(%[src_sobelx]) \n\t" // r=src_sobelx[i] - "gsldrc1 %[tb], 0x0(%[src_sobely]) \n\t" - "gsldlc1 %[tb], 0x7(%[src_sobely]) \n\t" // b=src_sobely[i] - "paddusb %[tr], %[tr], %[tb] \n\t" // g - 
"gssdrc1 %[tr], 0x0(%[dst_y]) \n\t" - "gssdlc1 %[tr], 0x7(%[dst_y]) \n\t" - - "daddiu %[dst_y], %[dst_y], 8 \n\t" - "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" - "daddiu %[src_sobely], %[src_sobely], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [tr] "=&f"(tr), [tb] "=&f"(tb) - : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely), - [dst_y] "r"(dst_y), [width] "r"(width) - : "memory"); -} - -void SobelXYRow_MMI(const uint8_t* src_sobelx, - const uint8_t* src_sobely, - uint8_t* dst_argb, - int width) { - uint64_t temp[3]; - uint64_t result = 0; - uint64_t gb = 0; - uint64_t cr = 0; - uint64_t c1 = 0xffffffffffffffff; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[tr], 0x07(%[src_sobelx]) \n\t" // r=src_sobelx[i] - "gsldrc1 %[tr], 0x00(%[src_sobelx]) \n\t" - "gsldlc1 %[tb], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i] - "gsldrc1 %[tb], 0x00(%[src_sobely]) \n\t" - "paddusb %[tg] , %[tr], %[tb] \n\t" // g - - // g3 b3 g2 b2 g1 b1 g0 b0 - "punpcklbh %[gb], %[tb], %[tg] \n\t" - // c3 r3 r2 r2 c1 r1 c0 r0 - "punpcklbh %[cr], %[tr], %[c1] \n\t" - // c1 r1 g1 b1 c0 r0 g0 b0 - "punpcklhw %[result], %[gb], %[cr] \n\t" - "gssdrc1 %[result], 0x00(%[dst_argb]) \n\t" - "gssdlc1 %[result], 0x07(%[dst_argb]) \n\t" - // c3 r3 g3 b3 c2 r2 g2 b2 - "punpckhhw %[result], %[gb], %[cr] \n\t" - "gssdrc1 %[result], 0x08(%[dst_argb]) \n\t" - "gssdlc1 %[result], 0x0f(%[dst_argb]) \n\t" - - // g7 b7 g6 b6 g5 b5 g4 b4 - "punpckhbh %[gb], %[tb], %[tg] \n\t" - // c7 r7 c6 r6 c5 r5 c4 r4 - "punpckhbh %[cr], %[tr], %[c1] \n\t" - // c5 r5 g5 b5 c4 r4 g4 b4 - "punpcklhw %[result], %[gb], %[cr] \n\t" - "gssdrc1 %[result], 0x10(%[dst_argb]) \n\t" - "gssdlc1 %[result], 0x17(%[dst_argb]) \n\t" - // c7 r7 g7 b7 c6 r6 g6 b6 - "punpckhhw %[result], %[gb], %[cr] \n\t" - "gssdrc1 %[result], 0x18(%[dst_argb]) \n\t" - "gssdlc1 %[result], 0x1f(%[dst_argb]) \n\t" - - "daddiu %[dst_argb], %[dst_argb], 32 \n\t" - "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" - 
"daddiu %[src_sobely], %[src_sobely], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [tr] "=&f"(temp[0]), [tb] "=&f"(temp[1]), [tg] "=&f"(temp[2]), - [gb] "=&f"(gb), [cr] "=&f"(cr), [result] "=&f"(result) - : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely), - [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1) - : "memory"); -} - -void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width) { - // Copy a Y to RGB. - uint64_t src, dest; - const uint64_t mask0 = 0x00ffffff00ffffffULL; - const uint64_t mask1 = ~mask0; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src], %[src], %[src] \n\t" - "punpcklhw %[dest], %[src], %[src] \n\t" - "and %[dest], %[dest], %[mask0] \n\t" - "or %[dest], %[dest], %[mask1] \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - - "punpckhhw %[dest], %[src], %[src] \n\t" - "and %[dest], %[dest], %[mask0] \n\t" - "or %[dest], %[dest], %[mask1] \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest) - : [src_ptr] "r"(src_y), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -// TODO - respect YuvConstants -void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, - const struct YuvConstants*, int width) { - uint64_t src, src_lo, src_hi, dest, dest_lo, dest_hi; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x55; - const uint64_t mask2 = 0xAA; - const uint64_t mask3 = 0xFF; - const uint64_t mask4 = 0x4A354A354A354A35ULL; - const uint64_t mask5 = 0x0488048804880488ULL; - const uint64_t shift0 = 0x08; - const uint64_t shift1 = 0x06; - - __asm__ 
volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - - "pshufh %[src], %[src_lo], %[mask0] \n\t" - "psllh %[dest_lo], %[src], %[shift0] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src] \n\t" - "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" - "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" - "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" - "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" - "pshufh %[src], %[src_lo], %[mask1] \n\t" - "psllh %[dest_hi], %[src], %[shift0] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src] \n\t" - "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" - "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" - "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" - "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "pshufh %[src], %[src_lo], %[mask2] \n\t" - "psllh %[dest_lo], %[src], %[shift0] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src] \n\t" - "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" - "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" - "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" - "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" - "pshufh %[src], %[src_lo], %[mask3] \n\t" - "psllh %[dest_hi], %[src], %[shift0] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src] \n\t" - "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" - "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" - "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" - "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "pshufh %[src], %[src_hi], %[mask0] \n\t" - "psllh %[dest_lo], %[src], %[shift0] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src] \n\t" - "pmulhuh %[dest_lo], %[dest_lo], %[mask4] 
\n\t" - "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" - "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" - "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" - "pshufh %[src], %[src_hi], %[mask1] \n\t" - "psllh %[dest_hi], %[src], %[shift0] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src] \n\t" - "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" - "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" - "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" - "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t" - - "pshufh %[src], %[src_hi], %[mask2] \n\t" - "psllh %[dest_lo], %[src], %[shift0] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src] \n\t" - "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" - "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" - "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" - "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" - "pshufh %[src], %[src_hi], %[mask3] \n\t" - "psllh %[dest_hi], %[src], %[shift0] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src] \n\t" - "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" - "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" - "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" - "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t" - - "daddi %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo) - : [src_ptr] "r"(src_y), [dst_ptr] "r"(rgb_buf), [mask0] "f"(mask0), - [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3), - [mask4] "f"(mask4), [mask5] "f"(mask5), [shift0] "f"(shift0), - [shift1] "f"(shift1), [width] "r"(width) - : "memory"); -} - 
-void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) { - uint64_t source, src0, src1, dest; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x1b; - - src += width - 1; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[source], 0(%[src_ptr]) \n\t" - "gsldrc1 %[source], -7(%[src_ptr]) \n\t" - "punpcklbh %[src0], %[source], %[mask0] \n\t" - "pshufh %[src0], %[src0], %[mask1] \n\t" - "punpckhbh %[src1], %[source], %[mask0] \n\t" - "pshufh %[src1], %[src1], %[mask1] \n\t" - "packushb %[dest], %[src1], %[src0] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddi %[src_ptr], %[src_ptr], -0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [source] "=&f"(source), [dest] "=&f"(dest), [src0] "=&f"(src0), - [src1] "=&f"(src1) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -void MirrorSplitUVRow_MMI(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t src0, src1, dest0, dest1; - const uint64_t mask0 = 0x00ff00ff00ff00ffULL; - const uint64_t mask1 = 0x1b; - const uint64_t shift = 0x08; - - src_uv += (width - 1) << 1; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 1(%[src_ptr]) \n\t" - "gsldrc1 %[src0], -6(%[src_ptr]) \n\t" - "gsldlc1 %[src1], -7(%[src_ptr]) \n\t" - "gsldrc1 %[src1], -14(%[src_ptr]) \n\t" - - "and %[dest0], %[src0], %[mask0] \n\t" - "pshufh %[dest0], %[dest0], %[mask1] \n\t" - "and %[dest1], %[src1], %[mask0] \n\t" - "pshufh %[dest1], %[dest1], %[mask1] \n\t" - "packushb %[dest0], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest0], 0x07(%[dstu_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dstu_ptr]) \n\t" - - "psrlh %[dest0], %[src0], %[shift] \n\t" - "pshufh %[dest0], %[dest0], %[mask1] \n\t" - "psrlh %[dest1], %[src1], %[shift] \n\t" - "pshufh %[dest1], %[dest1], %[mask1] \n\t" - "packushb %[dest0], %[dest0], %[dest1] 
\n\t" - "gssdlc1 %[dest0], 0x07(%[dstv_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dstv_ptr]) \n\t" - - "daddi %[src_ptr], %[src_ptr], -0x10 \n\t" - "daddiu %[dstu_ptr], %[dstu_ptr], 0x08 \n\t" - "daddiu %[dstv_ptr], %[dstv_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0), - [src1] "=&f"(src1) - : [src_ptr] "r"(src_uv), [dstu_ptr] "r"(dst_u), [dstv_ptr] "r"(dst_v), - [width] "r"(width), [mask0] "f"(mask0), [mask1] "f"(mask1), - [shift] "f"(shift) - : "memory"); -} - -void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) { - src += (width - 1) * 4; - uint64_t temp = 0x0; - uint64_t shuff = 0x4e; // 01 00 11 10 - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[temp], 3(%[src]) \n\t" - "gsldrc1 %[temp], -4(%[src]) \n\t" - "pshufh %[temp], %[temp], %[shuff] \n\t" - "gssdrc1 %[temp], 0x0(%[dst]) \n\t" - "gssdlc1 %[temp], 0x7(%[dst]) \n\t" - - "daddiu %[src], %[src], -0x08 \n\t" - "daddiu %[dst], %[dst], 0x08 \n\t" - "daddiu %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [temp] "=&f"(temp) - : [src] "r"(src), [dst] "r"(dst), [width] "r"(width), [shuff] "f"(shuff) - : "memory"); -} - -void SplitUVRow_MMI(const uint8_t* src_uv, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t temp[4]; - uint64_t shift = 0x08; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_uv]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_uv]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_uv]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_uv]) \n\t" - - "and %[t2], %[t0], %[c0] \n\t" - "and %[t3], %[t1], %[c0] \n\t" - "packushb %[t2], %[t2], %[t3] \n\t" - "gssdrc1 %[t2], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[t2], 0x7(%[dst_u]) \n\t" - - "psrlh %[t2], %[t0], %[shift] \n\t" - "psrlh %[t3], %[t1], %[shift] \n\t" - "packushb %[t2], %[t2], %[t3] \n\t" - "gssdrc1 %[t2], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[t2], 0x7(%[dst_v]) \n\t" - - "daddiu %[src_uv], %[src_uv], 16 
\n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]), - [t3] "=&f"(temp[3]) - : [src_uv] "r"(src_uv), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), - [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift) - : "memory"); -} - -void MergeUVRow_MMI(const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_uv, - int width) { - uint64_t temp[3]; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x0(%[src_u]) \n\t" - "gsldlc1 %[t0], 0x7(%[src_u]) \n\t" - "gsldrc1 %[t1], 0x0(%[src_v]) \n\t" - "gsldlc1 %[t1], 0x7(%[src_v]) \n\t" - "punpcklbh %[t2], %[t0], %[t1] \n\t" - "gssdrc1 %[t2], 0x0(%[dst_uv]) \n\t" - "gssdlc1 %[t2], 0x7(%[dst_uv]) \n\t" - "punpckhbh %[t2], %[t0], %[t1] \n\t" - "gssdrc1 %[t2], 0x8(%[dst_uv]) \n\t" - "gssdlc1 %[t2], 0xf(%[dst_uv]) \n\t" - - "daddiu %[src_u], %[src_u], 8 \n\t" - "daddiu %[src_v], %[src_v], 8 \n\t" - "daddiu %[dst_uv], %[dst_uv], 16 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]) - : [dst_uv] "r"(dst_uv), [src_u] "r"(src_u), [src_v] "r"(src_v), - [width] "r"(width) - : "memory"); -} - -void SplitRGBRow_MMI(const uint8_t* src_rgb, - uint8_t* dst_r, - uint8_t* dst_g, - uint8_t* dst_b, - int width) { - uint64_t src[4]; - uint64_t dest_hi, dest_lo, dest; - - __asm__ volatile( - "1: \n\t" - "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" - "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" - "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" - "punpcklbh %[dest_lo], %[src0], %[src1] \n\t" - "gslwlc1 %[src2], 0x09(%[src_ptr]) \n\t" - "gslwrc1 %[src2], 0x06(%[src_ptr]) \n\t" - "gslwlc1 %[src3], 0x0c(%[src_ptr]) \n\t" - "gslwrc1 %[src3], 0x09(%[src_ptr]) \n\t" - "punpcklbh %[dest_hi], %[src2], %[src3] \n\t" - - "punpcklhw %[dest], %[dest_lo], %[dest_hi] \n\t" - 
"gsswlc1 %[dest], 0x03(%[dstr_ptr]) \n\t" - "gsswrc1 %[dest], 0x00(%[dstr_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x03(%[dstg_ptr]) \n\t" - "gsswrc1 %[dest], 0x00(%[dstg_ptr]) \n\t" - "punpckhhw %[dest], %[dest_lo], %[dest_hi] \n\t" - "gsswlc1 %[dest], 0x03(%[dstb_ptr]) \n\t" - "gsswrc1 %[dest], 0x00(%[dstb_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t" - "daddiu %[dstr_ptr], %[dstr_ptr], 0x04 \n\t" - "daddiu %[dstg_ptr], %[dstg_ptr], 0x04 \n\t" - "daddiu %[dstb_ptr], %[dstb_ptr], 0x04 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src[0]), [src1] "=&f"(src[1]), [src2] "=&f"(src[2]), - [src3] "=&f"(src[3]), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_rgb), [dstr_ptr] "r"(dst_r), [dstg_ptr] "r"(dst_g), - [dstb_ptr] "r"(dst_b), [width] "r"(width) - : "memory"); -} - -void MergeRGBRow_MMI(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_rgb, - int width) { - uint64_t srcr, srcg, srcb, dest; - uint64_t srcrg_hi, srcrg_lo, srcbz_hi, srcbz_lo; - const uint64_t temp = 0x0; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[srcr], 0x07(%[srcr_ptr]) \n\t" - "gsldrc1 %[srcr], 0x00(%[srcr_ptr]) \n\t" - "gsldlc1 %[srcg], 0x07(%[srcg_ptr]) \n\t" - "gsldrc1 %[srcg], 0x00(%[srcg_ptr]) \n\t" - "punpcklbh %[srcrg_lo], %[srcr], %[srcg] \n\t" - "punpckhbh %[srcrg_hi], %[srcr], %[srcg] \n\t" - - "gsldlc1 %[srcb], 0x07(%[srcb_ptr]) \n\t" - "gsldrc1 %[srcb], 0x00(%[srcb_ptr]) \n\t" - "punpcklbh %[srcbz_lo], %[srcb], %[temp] \n\t" - "punpckhbh %[srcbz_hi], %[srcb], %[temp] \n\t" - - "punpcklhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t" - "gsswlc1 %[dest], 0x03(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x06(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x03(%[dst_ptr]) \n\t" - "punpckhhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t" - "gsswlc1 
%[dest], 0x09(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x06(%[dst_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x0c(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x09(%[dst_ptr]) \n\t" - "punpcklhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t" - "gsswlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x0c(%[dst_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x12(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "punpckhhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t" - "gsswlc1 %[dest], 0x15(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x12(%[dst_ptr]) \n\t" - "punpckhwd %[dest], %[dest], %[dest] \n\t" - "gsswlc1 %[dest], 0x18(%[dst_ptr]) \n\t" - "gsswrc1 %[dest], 0x15(%[dst_ptr]) \n\t" - - "daddiu %[srcr_ptr], %[srcr_ptr], 0x08 \n\t" - "daddiu %[srcg_ptr], %[srcg_ptr], 0x08 \n\t" - "daddiu %[srcb_ptr], %[srcb_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x18 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [srcr] "=&f"(srcr), [srcg] "=&f"(srcg), [srcb] "=&f"(srcb), - [dest] "=&f"(dest), [srcrg_hi] "=&f"(srcrg_hi), - [srcrg_lo] "=&f"(srcrg_lo), [srcbz_hi] "=&f"(srcbz_hi), - [srcbz_lo] "=&f"(srcbz_lo) - : [srcr_ptr] "r"(src_r), [srcg_ptr] "r"(src_g), [srcb_ptr] "r"(src_b), - [dst_ptr] "r"(dst_rgb), [width] "r"(width), [temp] "f"(temp) - : "memory"); -} - -// Filter 2 rows of YUY2 UV's (422) into U and V (420). 
-void YUY2ToUVRow_MMI(const uint8_t* src_yuy2, - int src_stride_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t c0 = 0xff00ff00ff00ff00; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t temp[3]; - uint64_t data[4]; - uint64_t shift = 0x08; - uint64_t src_stride = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" - "daddu %[src_stride], %[src_yuy2], %[src_stride_yuy2] \n\t" - "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" - - "gsldrc1 %[t2], 0x08(%[src_yuy2]) \n\t" - "gsldlc1 %[t2], 0x0f(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t" - "pavgb %[t1], %[t2], %[t1] \n\t" - - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "psrlh %[t0], %[t0], %[shift] \n\t" - "psrlh %[t1], %[t1], %[shift] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d0], %[t0], %[c1] \n\t" - "psrlh %[d1], %[t1], %[shift] \n\t" - - "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" - - "gsldrc1 %[t2], 0x18(%[src_yuy2]) \n\t" - "gsldlc1 %[t2], 0x1f(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t" - "pavgb %[t1], %[t2], %[t1] \n\t" - - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "psrlh %[t0], %[t0], %[shift] \n\t" - "psrlh %[t1], %[t1], %[shift] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d2], %[t0], %[c1] \n\t" - "psrlh %[d3], %[t1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d2] \n\t" - "packushb %[d1], %[d1], %[d3] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" - "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[d1], 
0x7(%[dst_v]) \n\t" - "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -16 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]), - [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), - [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride) - : [src_yuy2] "r"(src_yuy2), [src_stride_yuy2] "r"(src_stride_yuy2), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift) - : "memory"); -} - -// Copy row of YUY2 UV's (422) into U and V (422). -void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - uint64_t c0 = 0xff00ff00ff00ff00; - uint64_t c1 = 0x00ff00ff00ff00ff; - uint64_t temp[2]; - uint64_t data[4]; - uint64_t shift = 0x08; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "psrlh %[t0], %[t0], %[shift] \n\t" - "psrlh %[t1], %[t1], %[shift] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d0], %[t0], %[c1] \n\t" - "psrlh %[d1], %[t1], %[shift] \n\t" - - "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x18(%[src_yuy2]) \n\t" - "gsldlc1 %[t1], 0x1f(%[src_yuy2]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "psrlh %[t0], %[t0], %[shift] \n\t" - "psrlh %[t1], %[t1], %[shift] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d2], %[t0], %[c1] \n\t" - "psrlh %[d3], %[t1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d2] \n\t" - "packushb %[d1], %[d1], %[d3] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" - "gssdrc1 %[d1], 
0x0(%[dst_v]) \n\t" - "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" - "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -16 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]), - [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) - : [src_yuy2] "r"(src_yuy2), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), - [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift) - : "memory"); -} - -// Copy row of YUY2 Y's (422) into Y (420/422). -void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t temp[2]; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t" - "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t" - "daddiu %[src_yuy2], %[src_yuy2], 16 \n\t" - "daddiu %[dst_y], %[dst_y], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]) - : [src_yuy2] "r"(src_yuy2), [dst_y] "r"(dst_y), [width] "r"(width), - [c0] "f"(c0) - : "memory"); -} - -// Filter 2 rows of UYVY UV's (422) into U and V (420). -void UYVYToUVRow_MMI(const uint8_t* src_uyvy, - int src_stride_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - // Output a row of UV values. 
- uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t temp[3]; - uint64_t data[4]; - uint64_t shift = 0x08; - uint64_t src_stride = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" - "daddu %[src_stride], %[src_uyvy], %[src_stride_uyvy] \n\t" - "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" - - "gsldrc1 %[t2], 0x08(%[src_uyvy]) \n\t" - "gsldlc1 %[t2], 0x0f(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t" - "pavgb %[t1], %[t2], %[t1] \n\t" - - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d0], %[t0], %[c0] \n\t" - "psrlh %[d1], %[t1], %[shift] \n\t" - - "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" - - "gsldrc1 %[t2], 0x18(%[src_uyvy]) \n\t" - "gsldlc1 %[t2], 0x1f(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t" - "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t" - "pavgb %[t1], %[t2], %[t1] \n\t" - - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d2], %[t0], %[c0] \n\t" - "psrlh %[d3], %[t1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d2] \n\t" - "packushb %[d1], %[d1], %[d3] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" - "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" - "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -16 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]), - [d0] "=&f"(data[0]), [d1] 
"=&f"(data[1]), [d2] "=&f"(data[2]), - [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride) - : [src_uyvy] "r"(src_uyvy), [src_stride_uyvy] "r"(src_stride_uyvy), - [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), - [c0] "f"(c0), [shift] "f"(shift) - : "memory"); -} - -// Copy row of UYVY UV's (422) into U and V (422). -void UYVYToUV422Row_MMI(const uint8_t* src_uyvy, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - // Output a row of UV values. - uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t temp[2]; - uint64_t data[4]; - uint64_t shift = 0x08; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d0], %[t0], %[c0] \n\t" - "psrlh %[d1], %[t1], %[shift] \n\t" - - "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x18(%[src_uyvy]) \n\t" - "gsldlc1 %[t1], 0x1f(%[src_uyvy]) \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "mov.s %[t1], %[t0] \n\t" - "and %[d2], %[t0], %[c0] \n\t" - "psrlh %[d3], %[t1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d2] \n\t" - "packushb %[d1], %[d1], %[d3] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" - "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" - "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" - "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t" - "daddiu %[dst_u], %[dst_u], 8 \n\t" - "daddiu %[dst_v], %[dst_v], 8 \n\t" - "daddiu %[width], %[width], -16 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]), - [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) - : [src_uyvy] "r"(src_uyvy), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), - [width] "r"(width), [c0] 
"f"(c0), [shift] "f"(shift) - : "memory"); -} - -// Copy row of UYVY Y's (422) into Y (420/422). -void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - // Output a row of Y values. - uint64_t c0 = 0x00ff00ff00ff00ff; - uint64_t shift = 0x08; - uint64_t temp[2]; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" - "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" - "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t" - "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t" - "dsrl %[t0], %[t0], %[shift] \n\t" - "dsrl %[t1], %[t1], %[shift] \n\t" - "and %[t0], %[t0], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "and %[t1], %[t1], %[c0] \n\t" - "packushb %[t0], %[t0], %[t1] \n\t" - "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t" - "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t" - "daddiu %[src_uyvy], %[src_uyvy], 16 \n\t" - "daddiu %[dst_y], %[dst_y], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]) - : [src_uyvy] "r"(src_uyvy), [dst_y] "r"(dst_y), [width] "r"(width), - [c0] "f"(c0), [shift] "f"(shift) - : "memory"); -} - -// Blend src_argb over src_argb1 and store to dst_argb. -// dst_argb may be src_argb or src_argb1. -// This code mimics the SSSE3 version for better testability. 
-void ARGBBlendRow_MMI(const uint8_t* src_argb, - const uint8_t* src_argb1, - uint8_t* dst_argb, - int width) { - uint64_t src0, src1, dest, alpha, src0_hi, src0_lo, src1_hi, src1_lo, dest_hi, - dest_lo; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x00FFFFFF00FFFFFFULL; - const uint64_t mask2 = 0x00FF00FF00FF00FFULL; - const uint64_t mask3 = 0xFF; - const uint64_t mask4 = ~mask1; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t" - - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t" - - "psubush %[alpha], %[mask2], %[src0_lo] \n\t" - "pshufh %[alpha], %[alpha], %[mask3] \n\t" - "pmullh %[dest_lo], %[src1_lo], %[alpha] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" - "paddush %[dest_lo], %[dest_lo], %[src0_lo] \n\t" - - "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t" - "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t" - - "psubush %[alpha], %[mask2], %[src0_hi] \n\t" - "pshufh %[alpha], %[alpha], %[mask3] \n\t" - "pmullh %[dest_hi], %[src1_hi], %[alpha] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" - "paddush %[dest_hi], %[dest_hi], %[src0_hi] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[mask4] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [alpha] "=&f"(alpha), - [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), - [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo) - : 
[src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), - [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1), - [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4), - [shift] "f"(shift), [width] "r"(width) - : "memory"); -} - -void BlendPlaneRow_MMI(const uint8_t* src0, - const uint8_t* src1, - const uint8_t* alpha, - uint8_t* dst, - int width) { - uint64_t source0, source1, dest, alph; - uint64_t src0_hi, src0_lo, src1_hi, src1_lo, alpha_hi, alpha_lo, dest_hi, - dest_lo; - uint64_t alpha_rev, alpha_rev_lo, alpha_rev_hi; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0xFFFFFFFFFFFFFFFFULL; - const uint64_t mask2 = 0x00FF00FF00FF00FFULL; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t" - "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t" - - "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" - "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t" - "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t" - - "gsldlc1 %[alpha], 0x07(%[alpha_ptr]) \n\t" - "gsldrc1 %[alpha], 0x00(%[alpha_ptr]) \n\t" - "psubusb %[alpha_r], %[mask1], %[alpha] \n\t" - "punpcklbh %[alpha_lo], %[alpha], %[mask0] \n\t" - "punpckhbh %[alpha_hi], %[alpha], %[mask0] \n\t" - "punpcklbh %[alpha_rlo], %[alpha_r], %[mask0] \n\t" - "punpckhbh %[alpha_rhi], %[alpha_r], %[mask0] \n\t" - - "pmullh %[dest_lo], %[src0_lo], %[alpha_lo] \n\t" - "pmullh %[dest], %[src1_lo], %[alpha_rlo] \n\t" - "paddush %[dest_lo], %[dest_lo], %[dest] \n\t" - "paddush %[dest_lo], %[dest_lo], %[mask2] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" - - "pmullh %[dest_hi], %[src0_hi], %[alpha_hi] \n\t" - "pmullh %[dest], %[src1_hi], %[alpha_rhi] \n\t" - "paddush %[dest_hi], %[dest_hi], %[dest] \n\t" - "paddush %[dest_hi], %[dest_hi], %[mask2] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" - - "packushb %[dest], %[dest_lo], 
%[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[alpha_ptr], %[alpha_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(source0), [src1] "=&f"(source1), [alpha] "=&f"(alph), - [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), - [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), - [alpha_hi] "=&f"(alpha_hi), [alpha_lo] "=&f"(alpha_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [alpha_rlo] "=&f"(alpha_rev_lo), [alpha_rhi] "=&f"(alpha_rev_hi), - [alpha_r] "=&f"(alpha_rev) - : [src0_ptr] "r"(src0), [src1_ptr] "r"(src1), [alpha_ptr] "r"(alpha), - [dst_ptr] "r"(dst), [mask0] "f"(mask0), [mask1] "f"(mask1), - [mask2] "f"(mask2), [shift] "f"(shift), [width] "r"(width) - : "memory"); -} - -// Multiply source RGB by alpha and store to destination. -// This code mimics the SSSE3 version for better testability. 
-void ARGBAttenuateRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - int width) { - uint64_t src, src_hi, src_lo, dest, dest_hi, dest_lo, alpha; - const uint64_t mask0 = 0xFF; - const uint64_t mask1 = 0xFF000000FF000000ULL; - const uint64_t mask2 = ~mask1; - const uint64_t shift = 0x08; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[src] \n\t" - "punpckhbh %[src_hi], %[src], %[src] \n\t" - - "pshufh %[alpha], %[src_lo], %[mask0] \n\t" - "pmulhuh %[dest_lo], %[alpha], %[src_lo] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" - "pshufh %[alpha], %[src_hi], %[mask0] \n\t" - "pmulhuh %[dest_hi], %[alpha], %[src_hi] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "and %[dest], %[dest], %[mask2] \n\t" - "and %[src], %[src], %[mask1] \n\t" - "or %[dest], %[dest], %[src] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [alpha] "=&f"(alpha) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), - [mask1] "f"(mask1), [mask2] "f"(mask2), [shift] "f"(shift), - [width] "r"(width) - : "memory"); -} - -void ComputeCumulativeSumRow_MMI(const uint8_t* row, - int32_t* cumsum, - const int32_t* previous_cumsum, - int width) { - int64_t row_sum[2] = {0, 0}; - uint64_t src, dest0, dest1, presrc0, presrc1, dest; - const uint64_t mask = 0x0; - - __asm__ volatile( - "xor %[row_sum0], %[row_sum0], %[row_sum0] \n\t" - "xor %[row_sum1], %[row_sum1], %[row_sum1] \n\t" - - "1: \n\t" - "gslwlc1 %[src], 0x03(%[row_ptr]) \n\t" - "gslwrc1 %[src], 0x00(%[row_ptr]) 
\n\t" - - "punpcklbh %[src], %[src], %[mask] \n\t" - "punpcklhw %[dest0], %[src], %[mask] \n\t" - "punpckhhw %[dest1], %[src], %[mask] \n\t" - - "paddw %[row_sum0], %[row_sum0], %[dest0] \n\t" - "paddw %[row_sum1], %[row_sum1], %[dest1] \n\t" - - "gsldlc1 %[presrc0], 0x07(%[pre_ptr]) \n\t" - "gsldrc1 %[presrc0], 0x00(%[pre_ptr]) \n\t" - "gsldlc1 %[presrc1], 0x0f(%[pre_ptr]) \n\t" - "gsldrc1 %[presrc1], 0x08(%[pre_ptr]) \n\t" - - "paddw %[dest0], %[row_sum0], %[presrc0] \n\t" - "paddw %[dest1], %[row_sum1], %[presrc1] \n\t" - - "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[row_ptr], %[row_ptr], 0x04 \n\t" - "daddiu %[pre_ptr], %[pre_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x01 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1), [row_sum0] "+&f"(row_sum[0]), - [row_sum1] "+&f"(row_sum[1]), [presrc0] "=&f"(presrc0), - [presrc1] "=&f"(presrc1) - : [row_ptr] "r"(row), [pre_ptr] "r"(previous_cumsum), - [dst_ptr] "r"(cumsum), [width] "r"(width), [mask] "f"(mask) - : "memory"); -} - -// C version 2x2 -> 2x1. 
-void InterpolateRow_MMI(uint8_t* dst_ptr, - const uint8_t* src_ptr, - ptrdiff_t src_stride, - int width, - int source_y_fraction) { - if (source_y_fraction == 0) { - __asm__ volatile( - "1: \n\t" - "ld $t0, 0x0(%[src_ptr]) \n\t" - "sd $t0, 0x0(%[dst_ptr]) \n\t" - "daddiu %[src_ptr], %[src_ptr], 8 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : - : [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr), [width] "r"(width) - : "memory"); - return; - } - if (source_y_fraction == 128) { - uint64_t uv = 0x0; - uint64_t uv_stride = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[uv], 0x0(%[src_ptr]) \n\t" - "gsldlc1 %[uv], 0x7(%[src_ptr]) \n\t" - "daddu $t0, %[src_ptr], %[stride] \n\t" - "gsldrc1 %[uv_stride], 0x0($t0) \n\t" - "gsldlc1 %[uv_stride], 0x7($t0) \n\t" - - "pavgb %[uv], %[uv], %[uv_stride] \n\t" - "gssdrc1 %[uv], 0x0(%[dst_ptr]) \n\t" - "gssdlc1 %[uv], 0x7(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 8 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [uv] "=&f"(uv), [uv_stride] "=&f"(uv_stride) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(width), - [stride] "r"((int64_t)src_stride) - : "memory"); - return; - } - const uint8_t* src_ptr1 = src_ptr + src_stride; - uint64_t temp; - uint64_t data[4]; - uint64_t zero = 0x0; - uint64_t c0 = 0x0080008000800080; - uint64_t fy0 = 0x0100010001000100; - uint64_t shift = 0x8; - __asm__ volatile( - "pshufh %[fy1], %[fy1], %[zero] \n\t" - "psubh %[fy0], %[fy0], %[fy1] \n\t" - "1: \n\t" - "gsldrc1 %[t0], 0x0(%[src_ptr]) \n\t" - "gsldlc1 %[t0], 0x7(%[src_ptr]) \n\t" - "punpcklbh %[d0], %[t0], %[zero] \n\t" - "punpckhbh %[d1], %[t0], %[zero] \n\t" - "gsldrc1 %[t0], 0x0(%[src_ptr1]) \n\t" - "gsldlc1 %[t0], 0x7(%[src_ptr1]) \n\t" - "punpcklbh %[d2], %[t0], %[zero] \n\t" - "punpckhbh %[d3], %[t0], %[zero] \n\t" - - "pmullh %[d0], %[d0], %[fy0] 
\n\t" - "pmullh %[d2], %[d2], %[fy1] \n\t" - "paddh %[d0], %[d0], %[d2] \n\t" - "paddh %[d0], %[d0], %[c0] \n\t" - "psrlh %[d0], %[d0], %[shift] \n\t" - - "pmullh %[d1], %[d1], %[fy0] \n\t" - "pmullh %[d3], %[d3], %[fy1] \n\t" - "paddh %[d1], %[d1], %[d3] \n\t" - "paddh %[d1], %[d1], %[c0] \n\t" - "psrlh %[d1], %[d1], %[shift] \n\t" - - "packushb %[d0], %[d0], %[d1] \n\t" - "gssdrc1 %[d0], 0x0(%[dst_ptr]) \n\t" - "gssdlc1 %[d0], 0x7(%[dst_ptr]) \n\t" - "daddiu %[src_ptr], %[src_ptr], 8 \n\t" - "daddiu %[src_ptr1], %[src_ptr1], 8 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [t0] "=&f"(temp), [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), - [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) - : [src_ptr] "r"(src_ptr), [src_ptr1] "r"(src_ptr1), - [dst_ptr] "r"(dst_ptr), [width] "r"(width), - [fy1] "f"(source_y_fraction), [fy0] "f"(fy0), [c0] "f"(c0), - [shift] "f"(shift), [zero] "f"(zero) - : "memory"); -} - -// Use first 4 shuffler values to reorder ARGB channels. 
-void ARGBShuffleRow_MMI(const uint8_t* src_argb, - uint8_t* dst_argb, - const uint8_t* shuffler, - int width) { - uint64_t source, dest0, dest1, dest; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = (shuffler[0] & 0x03) | ((shuffler[1] & 0x03) << 2) | - ((shuffler[2] & 0x03) << 4) | - ((shuffler[3] & 0x03) << 6); - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - - "punpcklbh %[dest0], %[src], %[mask0] \n\t" - "pshufh %[dest0], %[dest0], %[mask1] \n\t" - "punpckhbh %[dest1], %[src], %[mask0] \n\t" - "pshufh %[dest1], %[dest1], %[mask1] \n\t" - "packushb %[dest], %[dest0], %[dest1] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -void I422ToYUY2Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width) { - uint64_t temp[3]; - uint64_t vu = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i] - "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i] - "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i] - "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i] - "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i] - "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i] - "punpcklbh %[vu], %[tu], %[tv] \n\t" // g - "punpcklbh %[tu], %[ty], %[vu] \n\t" // g - "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t" - "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t" - "punpckhbh %[tu], %[ty], %[vu] \n\t" // g - "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t" - "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t" - 
"daddiu %[src_y], %[src_y], 8 \n\t" - "daddiu %[src_u], %[src_u], 4 \n\t" - "daddiu %[src_v], %[src_v], 4 \n\t" - "daddiu %[dst_frame], %[dst_frame], 16 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]), - [vu] "=&f"(vu) - : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v), - [dst_frame] "r"(dst_frame), [width] "r"(width) - : "memory"); -} - -void I422ToUYVYRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_frame, - int width) { - uint64_t temp[3]; - uint64_t vu = 0x0; - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i] - "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i] - "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i] - "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i] - "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i] - "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i] - "punpcklbh %[vu], %[tu], %[tv] \n\t" // g - "punpcklbh %[tu], %[vu], %[ty] \n\t" // g - "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t" - "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t" - "punpckhbh %[tu], %[vu], %[ty] \n\t" // g - "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t" - "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t" - "daddiu %[src_y], %[src_y], 8 \n\t" - "daddiu %[src_u], %[src_u], 4 \n\t" - "daddiu %[src_v], %[src_v], 4 \n\t" - "daddiu %[dst_frame], %[dst_frame], 16 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" - : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]), - [vu] "=&f"(vu) - : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v), - [dst_frame] "r"(dst_frame), [width] "r"(width) - : "memory"); -} - -void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) { - uint64_t source, dest; - const uint64_t mask0 = 0xff000000ff000000ULL; - const uint64_t mask1 = ~mask0; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 
0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "and %[src], %[src], %[mask0] \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[src], %[dest] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(source), [dest] "=&f"(dest) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb, - uint8_t* dst_a, - int width) { - uint64_t src, dest0, dest1, dest_lo, dest_hi, dest; - const uint64_t mask = 0xff000000ff000000ULL; - const uint64_t shift = 0x18; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "and %[dest0], %[src], %[mask] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t" - "and %[dest1], %[src], %[mask] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - "packsswh %[dest_lo], %[dest0], %[dest1] \n\t" - - "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t" - "and %[dest0], %[src], %[mask] \n\t" - "psrlw %[dest0], %[dest0], %[shift] \n\t" - "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t" - "and %[dest1], %[src], %[mask] \n\t" - "psrlw %[dest1], %[dest1], %[shift] \n\t" - "packsswh %[dest_hi], %[dest0], %[dest1] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b 
\n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1), [dest_lo] "=&f"(dest_lo), [dest_hi] "=&f"(dest_hi) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_a), [mask] "f"(mask), - [shift] "f"(shift), [width] "r"(width) - : "memory"); -} - -void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) { - uint64_t source, dest0, dest1, dest; - const uint64_t mask0 = 0x0; - const uint64_t mask1 = 0x00ffffff00ffffffULL; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - - "punpcklbh %[dest0], %[mask0], %[src] \n\t" - "punpcklhw %[dest1], %[mask0], %[dest0] \n\t" - "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - "punpckhhw %[dest1], %[mask0], %[dest0] \n\t" - "gsldlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[dest1] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "punpckhbh %[dest0], %[mask0], %[src] \n\t" - "punpcklhw %[dest1], %[mask0], %[dest0] \n\t" - "gsldlc1 %[dest], 0x17(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x10(%[dst_ptr]) \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[dest1] \n\t" - "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t" - "punpckhhw %[dest1], %[mask0], %[dest0] \n\t" - "gsldlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" - "gsldrc1 %[dest], 0x18(%[dst_ptr]) \n\t" - "and %[dest], %[dest], %[mask1] \n\t" - "or %[dest], %[dest], %[dest1] \n\t" - "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t" - "daddi %[width], %[width], 
-0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), - [mask1] "f"(mask1), [width] "r"(width) - : "memory"); -} - -void I444ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - __asm__ volatile ( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub - "or %[ub], %[ub], %[mask] \n\t"//must sign extension - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t"//sign extension - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - "punpcklbh %[u], %[u], %[zero] \n\t"//u - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - "punpcklbh %[v], %[v], %[zero] \n\t"//v - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug - "psubsh %[g_vec0], %[g_vec0], 
%[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [y]"=&f"(y), - [u]"=&f"(u), [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [alpha]"f"(-1), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask) - : "memory" - ); -} - -// Also used for 420 -void I422ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v; - uint64_t 
b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub - "or %[ub], %[ub], %[mask] \n\t"//must sign extension - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t"//sign extension - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t"//v - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], 
%[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), - [u]"=&f"(u), [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [alpha]"f"(-1), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask) - : "memory" - ); -} - -// 10 bit YUV to ARGB -void I210ToARGBRow_MMI(const uint16_t* src_y, - const uint16_t* src_u, - const uint16_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) 
\n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" - - "1: \n\t" - "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "psllh %[y], %[y], %[six] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "punpcklhw %[u], %[u], %[u] \n\t" - "psrah %[u], %[u], %[two] \n\t" - "punpcklhw %[v], %[v], %[v] \n\t" - "psrah %[v], %[v], %[two] \n\t" - "pminsh %[u], %[u], %[mask1] \n\t" - "pminsh %[v], %[v], %[mask1] \n\t" - - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 
%[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), - [u]"=&f"(u), [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [alpha]"f"(-1), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask), [two]"f"(0x02), - [mask1]"f"(0x00ff00ff00ff00ff) - : "memory" - ); -} - -void I422AlphaToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v,a; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], 
%[mask] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t" - "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[a] \n\t" - "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t"//aaaagggg - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], 
%[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), [a]"=&f"(a), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [a_ptr]"r"(src_a), [zero]"f"(0x00), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask) - : "memory" - ); -} - -void I422ToRGB24Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y,u,v; - uint64_t b_vec[2],g_vec[2],r_vec[2]; - uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - - 
//u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" - "packushb %[g_vec0], %[g_vec0], %[zero] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - - "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t" - "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t" - "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t" - "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t" - "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t" - "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "pextrh %[r_vec1], %[g_vec1], %[one] \n\t" - "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t" - "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t" - "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 
0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask]"f"(mask), - [lmove1]"f"(0x18), [rmove1]"f"(0x8), - [one]"f"(0x1) - : "memory" - ); -} - -void I422ToARGB4444Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], 
%[y], %[yg] \n\t"//y1 - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "and %[g_vec], %[g_vec], %[mask1] \n\t" - "psrlw %[g_vec], %[g_vec], %[four] \n\t" - "psrlw %[r_vec], %[g_vec], %[four] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" - "and %[g_vec], %[g_vec], %[r_vec] \n\t" - - "and %[b_vec], %[b_vec], %[mask1] \n\t" - "psrlw %[b_vec], %[b_vec], %[four] \n\t" - "psrlw %[r_vec], %[b_vec], %[four] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" - "and %[b_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[b_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t" - "daddi 
%[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [dst_argb4444]"r"(dst_argb4444), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask]"f"(0xff00ff00ff00ff00), - [four]"f"(0x4), [mask1]"f"(0xf0f0f0f0f0f0f0f0), - [alpha]"f"(-1) - : "memory" - ); -} - -void I422ToARGB1555Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] 
\n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "psrlw %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "or %[g_vec], %[g_vec], %[mask3] \n\t" - - "psrlw %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "or %[b_vec], %[b_vec], %[mask3] \n\t" - - 
"punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [dst_argb1555]"r"(dst_argb1555), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [three]"f"(0x3), [mask2]"f"(0x1f0000001f), - [eight]"f"(0x8), [mask3]"f"(0x800000008000), - [lmove5]"f"(0x5) - : "memory" - ); -} - -void I422ToRGB565Row_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], 
%[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "psrlh %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - - 
"psrlh %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - - "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [dst_rgb565]"r"(dst_rgb565), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [three]"f"(0x3), [mask2]"f"(0x1f0000001f), - [eight]"f"(0x8), [seven]"f"(0x7), - [lmove5]"f"(0x5) - : "memory" - ); -} - -void NV12ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - 
"ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) 
\n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), - [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1) - : "memory" - ); -} - -void NV21ToARGBRow_MMI(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[ushu] \n\t" - "pshufh %[u], %[u], %[vshu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], 
%[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu), - [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1) - : "memory" - ); -} - -void NV12ToRGB24Row_MMI(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, 
- int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "punpckhwd 
%[r_vec], %[g_vec], %[g_vec] \n\t" - "psllw %[temp], %[r_vec], %[lmove1] \n\t" - "or %[g_vec], %[g_vec], %[temp] \n\t" - "psrlw %[temp], %[r_vec], %[rmove1] \n\t" - "pextrh %[temp], %[temp], %[zero] \n\t" - "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[zero] \n\t" - "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[one] \n\t" - "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" - "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" - "or %[b_vec], %[b_vec], %[temp] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), - [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1), [lmove1]"f"(0x18), - [one]"f"(0x1), [rmove1]"f"(0x8) - : "memory" - ); -} - -void NV21ToRGB24Row_MMI(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 
0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[ushu] \n\t" - "pshufh %[u], %[u], %[vshu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" - "psllw %[temp], %[r_vec], %[lmove1] \n\t" - "or %[g_vec], %[g_vec], %[temp] \n\t" - "psrlw %[temp], %[r_vec], %[rmove1] \n\t" - "pextrh %[temp], %[temp], %[zero] \n\t" - "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[zero] \n\t" - "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh 
%[temp], %[b_vec], %[one] \n\t" - "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" - "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" - "or %[b_vec], %[b_vec], %[temp] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu), - [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [lmove1]"f"(0x18), [rmove1]"f"(0x8), - [one]"f"(0x1) - : "memory" - ); -} - -void NV12ToRGB565Row_MMI(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], 
%[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "psrlh %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psubb %[y], %[eight], %[three] \n\t"//5 - "psllw %[r_vec], %[r_vec], %[y] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - - "psrlh %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw 
%[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psubb %[y], %[eight], %[three] \n\t"//5 - "psllw %[r_vec], %[r_vec], %[y] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - - "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), - [dst_rgb565]"r"(dst_rgb565), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [three]"f"(0x3), [mask2]"f"(0x1f0000001f), - [eight]"f"(0x8), [seven]"f"(0x7) - : "memory" - ); -} - -void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" 
- "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t" - "psrlh %[temp], %[y], %[eight] \n\t" - "pshufh %[u], %[temp], %[ushu] \n\t" - "pshufh %[v], %[temp], %[vshu] \n\t" - - "psrlh %[temp], %[mask1], %[eight] \n\t" - "and %[y], %[y], %[temp] \n\t" - "psllh %[temp], %[y], %[eight] \n\t" - "or %[y], %[y], %[temp] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi 
%[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [yuy2_ptr]"r"(src_yuy2), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1), [eight]"f"(0x8) - : "memory" - ); -} - -void UYVYToARGBRow_MMI(const uint8_t* src_uyvy, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t" - "psrlh %[temp], %[mask1], %[eight] \n\t" - "and %[temp], %[y], %[temp] \n\t" - "pshufh %[u], %[temp], %[ushu] \n\t" - "pshufh %[v], %[temp], %[vshu] \n\t" - - "psrlh %[y], %[y], %[eight] \n\t" - "psllh %[temp], %[y], %[eight] \n\t" - "or %[y], %[y], %[temp] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], 
%[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [uyvy_ptr]"r"(src_uyvy), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1), [eight]"f"(0x8) - : "memory" - ); -} - -void I422ToRGBARow_MMI(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - uint64_t y, u, v; - uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - - __asm__ volatile( - "ldc1 
%[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" - - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" - - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" - - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" - - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" - - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t" - "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t" - "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] 
\n\t" - - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [alpha]"f"(-1) - : "memory" - ); -} - -void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width) { - __asm__ volatile ( - "punpcklwd %[v32], %[v32], %[v32] \n\t" - "1: \n\t" - "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t" - - "daddi %[width], %[width], -0x04 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "bnez %[width], 1b \n\t" - : [v32]"+&f"(v32) - : [dst_ptr]"r"(dst_argb), [width]"r"(width) - : "memory" - ); -} -// clang-format on - -// 10 bit YUV to ARGB -#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/files/source/row_neon.cc b/files/source/row_neon.cc index 804ff839..4ed13638 100644 --- a/files/source/row_neon.cc +++ b/files/source/row_neon.cc @@ -89,12 +89,14 @@ extern "C" { "vsli.u16 d2, d2, #8 \n" \ "vsri.u16 d3, d3, #8 \n" +// TODO: Use single register for kUVCoeff and multiply by lane #define YUVTORGB_SETUP 
\ + "vld1.16 {d31}, [%[kRGBCoeffBias]] \n" \ "vld4.8 {d26[], d27[], d28[], d29[]}, [%[kUVCoeff]] \n" \ - "vld1.16 {d31[]}, [%[kRGBCoeffBias]]! \n" \ - "vld1.16 {d20[], d21[]}, [%[kRGBCoeffBias]]! \n" \ - "vld1.16 {d22[], d23[]}, [%[kRGBCoeffBias]]! \n" \ - "vld1.16 {d24[], d25[]}, [%[kRGBCoeffBias]] \n" + "vdup.u16 q10, d31[1] \n" \ + "vdup.u16 q11, d31[2] \n" \ + "vdup.u16 q12, d31[3] \n" \ + "vdup.u16 d31, d31[0] \n" // q0: B uint16x8_t // q1: G uint16x8_t @@ -156,6 +158,29 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, : "cc", "memory", YUVTORGB_REGS, "d6"); } +void I444ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV444 YUVTORGB + RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); +} + void I422ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -586,10 +611,10 @@ void DetileRow_NEON(const uint8_t* src, int width) { asm volatile( "1: \n" - "vld1.16 {q0}, [%0], %3 \n" // load 16 bytes + "vld1.8 {q0}, [%0], %3 \n" // load 16 bytes "subs %2, %2, #16 \n" // 16 processed per loop - "pld [%0, 1792] \n" - "vst1.16 {q0}, [%1]! \n" // store 16 bytes + "pld [%0, #1792] \n" + "vst1.8 {q0}, [%1]! \n" // store 16 bytes "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -599,6 +624,26 @@ void DetileRow_NEON(const uint8_t* src, ); } +// Reads 16 byte Y's of 16 bits from tile and writes out 16 Y's. 
+void DetileRow_16_NEON(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width) { + asm volatile( + "1: \n" + "vld1.16 {q0, q1}, [%0], %3 \n" // load 16 pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "pld [%0, #3584] \n" + "vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride * 2) // %3 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + // Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V. void DetileSplitUVRow_NEON(const uint8_t* src_uv, ptrdiff_t src_tile_stride, @@ -609,7 +654,7 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv, "1: \n" "vld2.8 {d0, d1}, [%0], %4 \n" "subs %3, %3, #16 \n" - "pld [%0, 1792] \n" + "pld [%0, #1792] \n" "vst1.8 {d0}, [%1]! \n" "vst1.8 {d1}, [%2]! \n" "bgt 1b \n" @@ -622,6 +667,101 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv, ); } +#if LIBYUV_USE_ST2 +// Read 16 Y, 8 UV, and write 8 YUYV. +void DetileToYUY2_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y + "pld [%0, #1792] \n" + "vld1.8 {q1}, [%1], %5 \n" // Load 8 UV + "pld [%1, #1792] \n" + "subs %3, %3, #16 \n" + "vst2.8 {q0, q1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber list + ); +} +#else +// Read 16 Y, 8 UV, and write 8 YUYV. 
+void DetileToYUY2_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y + "vld1.8 {q1}, [%1], %5 \n" // Load 8 UV + "subs %3, %3, #16 \n" + "pld [%0, #1792] \n" + "vzip.8 q0, q1 \n" + "pld [%1, #1792] \n" + "vst1.8 {q0, q1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber list + ); +} +#endif + +void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) { + asm volatile( + "1: \n" + "vld1.8 {q14}, [%0]! \n" // Load lower bits. + "vld1.8 {q9}, [%0]! \n" // Load upper bits row + // by row. + "vld1.8 {q11}, [%0]! \n" + "vld1.8 {q13}, [%0]! \n" + "vld1.8 {q15}, [%0]! \n" + "vshl.u8 q8, q14, #6 \n" // Shift lower bit data + // appropriately. + "vshl.u8 q10, q14, #4 \n" + "vshl.u8 q12, q14, #2 \n" + "vzip.u8 q8, q9 \n" // Interleave upper and + // lower bits. + "vzip.u8 q10, q11 \n" + "vzip.u8 q12, q13 \n" + "vzip.u8 q14, q15 \n" + "vsri.u16 q8, q8, #10 \n" // Copy upper 6 bits + // into lower 6 bits for + // better accuracy in + // conversions. + "vsri.u16 q9, q9, #10 \n" + "vsri.u16 q10, q10, #10 \n" + "vsri.u16 q11, q11, #10 \n" + "vsri.u16 q12, q12, #10 \n" + "vsri.u16 q13, q13, #10 \n" + "vsri.u16 q14, q14, #10 \n" + "vsri.u16 q15, q15, #10 \n" + "vstmia %1!, {q8-q15} \n" // Store pixel block (64 + // pixels). + "subs %2, %2, #80 \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(size) // %2 + : + : "cc", "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); +} + // Reads 16 U's and V's and writes out 16 pairs of UV. 
void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, @@ -664,7 +804,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, "+r"(dst_b), // %3 "+r"(width) // %4 : // Input registers - : "cc", "memory", "d0", "d1", "d2" // Clobber List + : "cc", "memory", "q0", "q1", "q2" // Clobber List ); } @@ -1505,6 +1645,29 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, ); } +void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width) { + asm volatile( + "add %1, %0, %1 \n" // stride + src_yuy2 + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vld2.8 {q2, q3}, [%1]! \n" // load next row YUY2. + "vrhadd.u8 q4, q1, q3 \n" // average rows of UV + "vst1.8 {q4}, [%2]! \n" // store 8 UV. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(stride_yuy2), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", + "d7" // Clobber List + ); +} + // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. void ARGBShuffleRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, @@ -1590,7 +1753,7 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb, void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width) { asm volatile( "vdup.32 d7, %2 \n" // dither4 @@ -1762,7 +1925,7 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, ); } -// TODO(fbarchard): Subsample match C code. +// TODO(fbarchard): Subsample match Intel code. 
void ARGBToUVJRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -1808,6 +1971,51 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, ); } +void ABGRToUVJRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_uj, + uint8_t* dst_vj, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + RGBTOUV(q2, q1, q0) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_stride_abgr), // %1 + "+r"(dst_uj), // %2 + "+r"(dst_vj), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + // TODO(fbarchard): Subsample match C code. 
void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, int src_stride_rgb24, @@ -2567,6 +2775,10 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants); } +void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants); +} + // RGBA expects first value to be A and ignored, then 3 values to contain RGB. // Same code as ARGB, except the LD4 void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, @@ -3633,7 +3845,7 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv, "+r"(dst_v), // %2 "+r"(width) // %3 : "r"(shift) // %4 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); + : "cc", "memory", "q0", "q1", "q2"); } void MergeUVRow_16_NEON(const uint16_t* src_u, @@ -3687,31 +3899,25 @@ void DivideRow_16_NEON(const uint16_t* src_y, int scale, int width) { asm volatile( - "vdup.16 q0, %3 \n" - "1: \n" - "vld1.16 {q1}, [%0]! \n" - "vld1.16 {q2}, [%0]! \n" - "vmovl.u16 q3, d2 \n" - "vmovl.u16 q1, d3 \n" - "vmovl.u16 q4, d4 \n" - "vmovl.u16 q2, d5 \n" - "vshl.u32 q3, q3, q0 \n" - "vshl.u32 q4, q4, q0 \n" - "vshl.u32 q1, q1, q0 \n" - "vshl.u32 q2, q2, q0 \n" - "vmovn.u32 d2, q3 \n" - "vmovn.u32 d3, q1 \n" - "vmovn.u32 d4, q4 \n" - "vmovn.u32 d5, q2 \n" - "vst1.16 {q1}, [%1]! \n" - "vst1.16 {q2}, [%1]! \n" + "vdup.16 d8, %3 \n" + "1: \n" + "vld1.16 {q2, q3}, [%0]! \n" + "vmull.u16 q0, d4, d8 \n" + "vmull.u16 q1, d5, d8 \n" + "vmull.u16 q2, d6, d8 \n" + "vmull.u16 q3, d7, d8 \n" + "vshrn.u32 d0, q0, #16 \n" + "vshrn.u32 d1, q1, #16 \n" + "vshrn.u32 d2, q2, #16 \n" + "vshrn.u32 d3, q3, #16 \n" + "vst1.16 {q0, q1}, [%1]! 
\n" // store 16 pixels "subs %2, %2, #16 \n" // 16 src pixels per loop "bgt 1b \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "r"(scale) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); + : "cc", "memory", "q0", "q1", "q2", "q3", "d8"); } // Use scale to convert lsb formats to msb, depending how many bits there are: diff --git a/files/source/row_neon64.cc b/files/source/row_neon64.cc index 0f120373..74190d61 100644 --- a/files/source/row_neon64.cc +++ b/files/source/row_neon64.cc @@ -142,6 +142,29 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, : "cc", "memory", YUVTORGB_REGS, "v19"); } +void I444ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV444 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); +} + void I422ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -627,6 +650,26 @@ void DetileRow_NEON(const uint8_t* src, ); } +// Reads 16 byte Y's of 16 bits from tile and writes out 16 Y's. 
+void DetileRow_16_NEON(const uint16_t* src, + ptrdiff_t src_tile_stride, + uint16_t* dst, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.8h,v1.8h}, [%0], %3 \n" // load 16 pixels + "subs %w2, %w2, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 3584] \n" // 7 tiles of 512b ahead + "st1 {v0.8h,v1.8h}, [%1], #32 \n" // store 16 pixels + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(src_tile_stride * 2) // %3 + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + // Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V. void DetileSplitUVRow_NEON(const uint8_t* src_uv, ptrdiff_t src_tile_stride, @@ -650,6 +693,100 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv, ); } +#if LIBYUV_USE_ST2 +// Read 16 Y, 8 UV, and write 8 YUY2 +void DetileToYUY2_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys + "prfm pldl1keep, [%0, 1792] \n" + "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs + "prfm pldl1keep, [%1, 1792] \n" + "subs %w3, %w3, #16 \n" // store 8 YUY2 + "st2 {v0.16b,v1.16b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "v0", "v1" // Clobber list + ); +} +#else +// Read 16 Y, 8 UV, and write 8 YUY2 +void DetileToYUY2_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys + "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs + "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%0, 1792] \n" + "zip1 v2.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%1, 1792] \n" + "zip2 v3.16b, v0.16b, v1.16b \n" + "st1 {v2.16b,v3.16b}, [%2], #32 \n" // store 
8 YUY2 + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber list + ); +} +#endif + +// Unpack MT2T into tiled P010 64 pixels at a time. See +// tinyurl.com/mtk-10bit-video-format for format documentation. +void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) { + asm volatile( + "1: \n" + "ld1 {v7.16b}, [%0], #16 \n" + "ld1 {v0.16b-v3.16b}, [%0], #64 \n" + "shl v4.16b, v7.16b, #6 \n" + "shl v5.16b, v7.16b, #4 \n" + "shl v6.16b, v7.16b, #2 \n" + "subs %2, %2, #80 \n" + "zip1 v16.16b, v4.16b, v0.16b \n" + "zip1 v18.16b, v5.16b, v1.16b \n" + "zip1 v20.16b, v6.16b, v2.16b \n" + "zip1 v22.16b, v7.16b, v3.16b \n" + "zip2 v17.16b, v4.16b, v0.16b \n" + "zip2 v19.16b, v5.16b, v1.16b \n" + "zip2 v21.16b, v6.16b, v2.16b \n" + "zip2 v23.16b, v7.16b, v3.16b \n" + "sri v16.8h, v16.8h, #10 \n" + "sri v17.8h, v17.8h, #10 \n" + "sri v18.8h, v18.8h, #10 \n" + "sri v19.8h, v19.8h, #10 \n" + "st1 {v16.8h-v19.8h}, [%1], #64 \n" + "sri v20.8h, v20.8h, #10 \n" + "sri v21.8h, v21.8h, #10 \n" + "sri v22.8h, v22.8h, #10 \n" + "sri v23.8h, v23.8h, #10 \n" + "st1 {v20.8h-v23.8h}, [%1], #64 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(size) // %2 + : + : "cc", "memory", "w0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); +} + #if LIBYUV_USE_ST2 // Reads 16 U's and V's and writes out 16 pairs of UV. void MergeUVRow_NEON(const uint8_t* src_u, @@ -1729,6 +1866,29 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, ); } +void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width) { + const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. 
+ "ld2 {v2.16b,v3.16b}, [%1], #32 \n" // load next row + "urhadd v4.16b, v1.16b, v3.16b \n" // average rows of UV + "prfm pldl1keep, [%0, 448] \n" + "st1 {v4.16b}, [%2], #16 \n" // store 8 UV. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(src_yuy2b), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List + ); +} + // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. void ARGBShuffleRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, @@ -1819,24 +1979,23 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb, void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width) { asm volatile( - "dup v1.4s, %w2 \n" // dither4 + "dup v1.4s, %w3 \n" // dither4 "1: \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // load 8 - // pixels - "subs %w3, %w3, #8 \n" // 8 processed per loop. + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. "uqadd v16.8b, v16.8b, v1.8b \n" "prfm pldl1keep, [%0, 448] \n" "uqadd v17.8b, v17.8b, v1.8b \n" "uqadd v18.8b, v18.8b, v1.8b \n" ARGBTORGB565 - "st1 {v18.16b}, [%0], #16 \n" // store 8 pixels RGB565. + "st1 {v18.16b}, [%1], #16 \n" // store 8 pixels RGB565. "b.gt 1b \n" - : "+r"(dst_rgb) // %0 - : "r"(src_argb), // %1 - "r"(dither4), // %2 - "r"(width) // %3 + : "+r"(src_argb), // %0 + "+r"(dst_rgb), // %1 + "+r"(width) // %2 + : "r"(dither4) // %3 : "cc", "memory", "v1", "v16", "v17", "v18", "v19"); } @@ -2144,6 +2303,7 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, ); } +// TODO(fbarchard): Subsample match Intel code. 
void ARGBToUVJRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -2189,6 +2349,51 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, ); } +void ABGRToUVJRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_uj, + uint8_t* dst_vj, + int width) { + const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; + asm volatile ( + "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 + "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 + "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 + "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 + "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 + "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + RGBTOUV(v2.8h, v1.8h, v0.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_abgr_1), // %1 + "+r"(dst_uj), // %2 + "+r"(dst_vj), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, @@ -2812,6 +3017,10 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants); } +void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants); +} + // RGBA expects first value to be A and ignored, then 3 values to contain RGB. // Same code as ARGB, except the LD4 void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, @@ -4241,23 +4450,19 @@ void DivideRow_16_NEON(const uint16_t* src_y, int scale, int width) { asm volatile( - "dup v0.8h, %w3 \n" + "dup v4.8h, %w3 \n" "1: \n" - "ldp q1, q2, [%0], #32 \n" - "ushll v3.4s, v1.4h, #0 \n" - "ushll v4.4s, v2.4h, #0 \n" + "ldp q2, q3, [%0], #32 \n" + "umull v0.4s, v2.4h, v4.4h \n" + "umull2 v1.4s, v2.8h, v4.8h \n" + "umull v2.4s, v3.4h, v4.4h \n" + "umull2 v3.4s, v3.8h, v4.8h \n" "prfm pldl1keep, [%0, 448] \n" - "ushll2 v1.4s, v1.8h, #0 \n" - "ushll2 v2.4s, v2.8h, #0 \n" - "mul v3.4s, v0.4s, v3.4s \n" - "mul v4.4s, v0.4s, v4.4s \n" - "mul v1.4s, v0.4s, v1.4s \n" - "mul v2.4s, v0.4s, v2.4s \n" - "shrn v3.4h, v3.4s, #16 \n" - "shrn v4.4h, v4.4s, #16 \n" - "shrn2 v3.8h, v1.4s, #16 \n" - "shrn2 v4.8h, v2.4s, #16 \n" - "stp q3, q3, [%1], #32 \n" // store 16 pixels + "shrn v0.4h, v0.4s, #16 \n" + "shrn2 v0.8h, v1.4s, #16 \n" + "shrn v1.4h, v2.4s, #16 \n" + "shrn2 v1.8h, v3.4s, #16 \n" + "stp q0, q1, [%1], #32 \n" // store 16 pixels "subs %w2, %w2, #16 \n" // 16 src pixels per loop "b.gt 1b \n" : "+r"(src_y), // %0 diff --git a/files/source/row_rvv.cc b/files/source/row_rvv.cc new file mode 100644 index 00000000..27e91a3b --- 
/dev/null +++ b/files/source/row_rvv.cc @@ -0,0 +1,956 @@ +/* + * Copyright 2023 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * Copyright (c) 2023 SiFive, Inc. All rights reserved. + * + * Contributed by Darren Hsieh + * Contributed by Bruce Lai + */ + +#include "libyuv/row.h" + +#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) +#include +#include + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Fill YUV -> RGB conversion constants into vectors +// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode +// register) is set to round-to-nearest-up mode(0). +#define YUVTORGB_SETUP(vl, yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \ + { \ + asm volatile("csrwi vxrm, 0"); \ + ub = yuvconst->kUVCoeff[0]; \ + vr = yuvconst->kUVCoeff[1]; \ + ug = yuvconst->kUVCoeff[2]; \ + vg = yuvconst->kUVCoeff[3]; \ + yg = yuvconst->kRGBCoeffBias[0]; \ + bb = yuvconst->kRGBCoeffBias[1] + 32; \ + bg = yuvconst->kRGBCoeffBias[2] - 32; \ + br = yuvconst->kRGBCoeffBias[3] + 32; \ + } + +// Read [VLEN/8] Y, [VLEN/(8 * 2)] U and [VLEN/(8 * 2)] V from 422 +#define READYUV422(vl, v_u, v_v, v_y_16) \ + { \ + vuint8m1_t v_tmp0, v_tmp1; \ + vuint8m2_t v_y; \ + vuint16m2_t v_u_16, v_v_16; \ + vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ + v_tmp0 = __riscv_vle8_v_u8m1(src_u, vl); \ + v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ + v_tmp1 = __riscv_vle8_v_u8m1(src_v, vl); \ + v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ + v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ + v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ + v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ + v_u = 
__riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ + } + +// Read [VLEN/8] Y, [VLEN/8] U, and [VLEN/8] V from 444 +#define READYUV444(vl, v_u, v_v, v_y_16) \ + { \ + vuint8m2_t v_y; \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_u = __riscv_vle8_v_u8m2(src_u, vl); \ + v_v = __riscv_vle8_v_u8m2(src_v, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ + } + +// Convert from YUV to fixed point RGB +#define YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, \ + v_b_16, v_r_16) \ + { \ + vuint16m4_t v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4; \ + vuint32m8_t v_tmp5; \ + v_tmp0 = __riscv_vwmulu_vx_u16m4(v_u, ug, vl); \ + v_y_16 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl); \ + v_tmp0 = __riscv_vwmaccu_vx_u16m4(v_tmp0, vg, v_v, vl); \ + v_tmp1 = __riscv_vwmulu_vx_u16m4(v_u, ub, vl); \ + v_tmp5 = __riscv_vwmulu_vx_u32m8(v_y_16, yg, vl); \ + v_tmp2 = __riscv_vnsrl_wx_u16m4(v_tmp5, 16, vl); \ + v_tmp3 = __riscv_vadd_vx_u16m4(v_tmp2, bg, vl); \ + v_tmp4 = __riscv_vadd_vv_u16m4(v_tmp2, v_tmp1, vl); \ + v_tmp2 = __riscv_vwmaccu_vx_u16m4(v_tmp2, vr, v_v, vl); \ + v_g_16 = __riscv_vssubu_vv_u16m4(v_tmp3, v_tmp0, vl); \ + v_b_16 = __riscv_vssubu_vx_u16m4(v_tmp4, bb, vl); \ + v_r_16 = __riscv_vssubu_vx_u16m4(v_tmp2, br, vl); \ + } + +// Convert from fixed point RGB To 8 bit RGB +#define RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r) \ + { \ + v_g = __riscv_vnclipu_wx_u8m2(v_g_16, 6, vl); \ + v_b = __riscv_vnclipu_wx_u8m2(v_b_16, 6, vl); \ + v_r = __riscv_vnclipu_wx_u8m2(v_r_16, 6, vl); \ + } + +void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { + size_t avl = (size_t)4 * width; + do { + vuint16m8_t v_ar64; + vuint8m4_t v_argb; + size_t vl = __riscv_vsetvl_e8m4(avl); + v_argb = __riscv_vle8_v_u8m4(src_argb, vl); + v_ar64 = __riscv_vwaddu_vx_u16m8(v_argb, 0, vl); + v_ar64 = 
__riscv_vmul_vx_u16m8(v_ar64, 0x0101, vl); + __riscv_vse16_v_u16m8(dst_ar64, v_ar64, vl); + avl -= vl; + src_argb += vl; + dst_ar64 += vl; + } while (avl > 0); +} + +void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { + size_t avl = (size_t)width; + do { + vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16; + vuint8m1_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m1(avl); + __riscv_vlseg4e8_v_u8m1(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + v_b_16 = __riscv_vwaddu_vx_u16m2(v_b, 0, vl); + v_g_16 = __riscv_vwaddu_vx_u16m2(v_g, 0, vl); + v_r_16 = __riscv_vwaddu_vx_u16m2(v_r, 0, vl); + v_a_16 = __riscv_vwaddu_vx_u16m2(v_a, 0, vl); + v_b_16 = __riscv_vmul_vx_u16m2(v_b_16, 0x0101, vl); + v_g_16 = __riscv_vmul_vx_u16m2(v_g_16, 0x0101, vl); + v_r_16 = __riscv_vmul_vx_u16m2(v_r_16, 0x0101, vl); + v_a_16 = __riscv_vmul_vx_u16m2(v_a_16, 0x0101, vl); + __riscv_vsseg4e16_v_u16m2(dst_ab64, v_r_16, v_g_16, v_b_16, v_a_16, vl); + avl -= vl; + src_argb += 4 * vl; + dst_ab64 += 4 * vl; + } while (avl > 0); +} + +void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { + size_t avl = (size_t)4 * width; + do { + vuint16m8_t v_ar64; + vuint8m4_t v_argb; + size_t vl = __riscv_vsetvl_e16m8(avl); + v_ar64 = __riscv_vle16_v_u16m8(src_ar64, vl); + v_argb = __riscv_vnsrl_wx_u8m4(v_ar64, 8, vl); + __riscv_vse8_v_u8m4(dst_argb, v_argb, vl); + avl -= vl; + src_ar64 += vl; + dst_argb += vl; + } while (avl > 0); +} + +void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { + size_t avl = (size_t)width; + do { + vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16; + vuint8m1_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e16m2(avl); + __riscv_vlseg4e16_v_u16m2(&v_r_16, &v_g_16, &v_b_16, &v_a_16, src_ab64, vl); + v_b = __riscv_vnsrl_wx_u8m1(v_b_16, 8, vl); + v_g = __riscv_vnsrl_wx_u8m1(v_g_16, 8, vl); + v_r = __riscv_vnsrl_wx_u8m1(v_r_16, 8, vl); + v_a = __riscv_vnsrl_wx_u8m1(v_a_16, 8, vl); + __riscv_vsseg4e8_v_u8m1(dst_argb, 
v_b, v_g, v_r, v_a, vl); + avl -= vl; + src_ab64 += 4 * vl; + dst_argb += 4 * vl; + } while (avl > 0); +} + +void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_b, v_g, v_r; + __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_raw += vl * 3; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} + +void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_b, v_g, v_r; + __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl); + __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl); + w -= vl; + src_raw += vl * 3; + dst_rgba += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} + +void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_raw, vl); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_r, v_g, v_b, vl); + w -= vl; + src_raw += vl * 3; + dst_rgb24 += vl * 3; + } while (w > 0); +} + +void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vsseg3e8_v_u8m2(dst_raw, v_r, v_g, v_b, vl); + w -= vl; + src_argb += vl * 4; + dst_raw += vl * 3; + } while (w > 0); +} + +void ARGBToRGB24Row_RVV(const uint8_t* src_argb, + uint8_t* dst_rgb24, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + 
__riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_argb += vl * 4; + dst_rgb24 += vl * 3; + } while (w > 0); +} + +void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_b, v_g, v_r; + __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb24, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_rgb24 += vl * 3; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} + +void I444ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READYUV444(vl, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_u += vl; + src_v += vl; + dst_argb += vl * 4; + } while (w > 0); +} + +void I444AlphaToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV444(vl, v_u, v_v, 
v_y_16); + v_a = __riscv_vle8_v_u8m2(src_a, vl); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_a += vl; + src_u += vl; + src_v += vl; + dst_argb += vl * 4; + } while (w > 0); +} + +void I444ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV444(vl, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_u += vl; + src_v += vl; + dst_rgb24 += vl * 3; + } while (w > 0); +} + +void I422ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READYUV422(vl, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_argb += vl * 4; + } while (w > 0); +} + 
+void I422AlphaToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV422(vl, v_u, v_v, v_y_16); + v_a = __riscv_vle8_v_u8m2(src_a, vl); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_a += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_argb += vl * 4; + } while (w > 0); +} + +void I422ToRGBARow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READYUV422(vl, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_rgba += vl * 4; + } while (w > 0); +} + +void I422ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + 
vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(vl, yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV422(vl, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + w -= vl; + src_y += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_rgb24 += vl * 3; + } while (w > 0); +} + +void I400ToARGBRow_RVV(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + const bool is_yb_positive = (yuvconstants->kRGBCoeffBias[4] >= 0); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + vuint16m4_t v_yb; + vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl); + // To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) sets to round-to-nearest-up mode(0). 
+ asm volatile("csrwi vxrm, 0"); + if (is_yb_positive) { + v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl); + } else { + v_yb = __riscv_vmv_v_x_u16m4(-yuvconstants->kRGBCoeffBias[4] + 32, vl); + } + do { + vuint8m2_t v_y, v_out; + vuint16m4_t v_y_16, v_tmp0, v_tmp1, v_tmp2; + vl = __riscv_vsetvl_e8m2(w); + v_y = __riscv_vle8_v_u8m2(src_y, vl); + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); + v_tmp0 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl); // 257 * v_y + v_tmp1 = __riscv_vmulhu_vv_u16m4(v_tmp0, v_yg, vl); + if (is_yb_positive) { + v_tmp2 = __riscv_vsaddu_vv_u16m4(v_tmp1, v_yb, vl); + } else { + v_tmp2 = __riscv_vssubu_vv_u16m4(v_tmp1, v_yb, vl); + } + v_out = __riscv_vnclipu_wx_u8m2(v_tmp2, 6, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_out, v_out, v_out, v_a, vl); + w -= vl; + src_y += vl; + dst_argb += vl * 4; + } while (w > 0); +} + +void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_y; + v_y = __riscv_vle8_v_u8m2(src_y, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_y, v_y, v_y, v_a, vl); + w -= vl; + src_y += vl; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} + +void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m8(w); + vuint8m8_t v_data = __riscv_vle8_v_u8m8(src, vl); + __riscv_vse8_v_u8m8(dst, v_data, vl); + w -= vl; + src += vl; + dst += vl; + } while (w > 0); +} + +// Bilinear filter [VLEN/8]x2 -> [VLEN/8]x1 +void InterpolateRow_RVV(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8_t* src_ptr1 = src_ptr + src_stride; + size_t dst_w = (size_t)dst_width; + assert(source_y_fraction >= 0); + assert(source_y_fraction < 
256); + // Blend 100 / 0 - Copy row unchanged. + if (y1_fraction == 0) { + do { + size_t vl = __riscv_vsetvl_e8m8(dst_w); + __riscv_vse8_v_u8m8(dst_ptr, __riscv_vle8_v_u8m8(src_ptr, vl), vl); + dst_w -= vl; + src_ptr += vl; + dst_ptr += vl; + } while (dst_w > 0); + return; + } + // Blend 50 / 50. + if (y1_fraction == 128) { + do { + size_t vl = __riscv_vsetvl_e8m8(dst_w); + vuint8m8_t row0 = __riscv_vle8_v_u8m8(src_ptr, vl); + vuint8m8_t row1 = __riscv_vle8_v_u8m8(src_ptr1, vl); + // Averaging add + vuint8m8_t row_out = __riscv_vaaddu_vv_u8m8(row0, row1, vl); + __riscv_vse8_v_u8m8(dst_ptr, row_out, vl); + dst_w -= vl; + src_ptr += vl; + src_ptr1 += vl; + dst_ptr += vl; + } while (dst_w > 0); + return; + } + // General purpose row blend. + // To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up(0). + asm volatile("csrwi vxrm, 0"); + do { + size_t vl = __riscv_vsetvl_e8m4(dst_w); + vuint8m4_t row0 = __riscv_vle8_v_u8m4(src_ptr, vl); + vuint16m8_t acc = __riscv_vwmulu_vx_u16m8(row0, y0_fraction, vl); + vuint8m4_t row1 = __riscv_vle8_v_u8m4(src_ptr1, vl); + acc = __riscv_vwmaccu_vx_u16m8(acc, y1_fraction, row1, vl); + __riscv_vse8_v_u8m4(dst_ptr, __riscv_vnclipu_wx_u8m4(acc, 8, vl), vl); + dst_w -= vl; + src_ptr += vl; + src_ptr1 += vl; + dst_ptr += vl; + } while (dst_w > 0); +} + +void SplitRGBRow_RVV(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_rgb, vl); + __riscv_vse8_v_u8m2(dst_r, v_r, vl); + __riscv_vse8_v_u8m2(dst_g, v_g, vl); + __riscv_vse8_v_u8m2(dst_b, v_b, vl); + w -= vl; + dst_r += vl; + dst_g += vl; + dst_b += vl; + src_rgb += vl * 3; + } while (w > 0); +} + +void MergeRGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + size_t w = 
(size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl); + vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl); + vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl); + __riscv_vsseg3e8_v_u8m2(dst_rgb, v_r, v_g, v_b, vl); + w -= vl; + src_r += vl; + src_g += vl; + src_b += vl; + dst_rgb += vl * 3; + } while (w > 0); +} + +void SplitARGBRow_RVV(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vse8_v_u8m2(dst_a, v_a, vl); + __riscv_vse8_v_u8m2(dst_r, v_r, vl); + __riscv_vse8_v_u8m2(dst_g, v_g, vl); + __riscv_vse8_v_u8m2(dst_b, v_b, vl); + w -= vl; + dst_a += vl; + dst_r += vl; + dst_g += vl; + dst_b += vl; + src_argb += vl * 4; + } while (w > 0); +} + +void MergeARGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl); + vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl); + vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl); + vuint8m2_t v_a = __riscv_vle8_v_u8m2(src_a, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_r += vl; + src_g += vl; + src_b += vl; + src_a += vl; + dst_argb += vl * 4; + } while (w > 0); +} + +void SplitXRGBRow_RVV(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vse8_v_u8m2(dst_r, v_r, vl); + __riscv_vse8_v_u8m2(dst_g, v_g, vl); + __riscv_vse8_v_u8m2(dst_b, v_b, vl); + w -= vl; + dst_r += vl; + dst_g += 
vl; + dst_b += vl; + src_argb += vl * 4; + } while (w > 0); +} + +void MergeXRGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_r, v_g, v_b; + v_r = __riscv_vle8_v_u8m2(src_r, vl); + v_g = __riscv_vle8_v_u8m2(src_g, vl); + v_b = __riscv_vle8_v_u8m2(src_b, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_r += vl; + src_g += vl; + src_b += vl; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} + +void SplitUVRow_RVV(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m4(w); + vuint8m4_t v_u, v_v; + __riscv_vlseg2e8_v_u8m4(&v_u, &v_v, src_uv, vl); + __riscv_vse8_v_u8m4(dst_u, v_u, vl); + __riscv_vse8_v_u8m4(dst_v, v_v, vl); + w -= vl; + dst_u += vl; + dst_v += vl; + src_uv += 2 * vl; + } while (w > 0); +} + +void MergeUVRow_RVV(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + size_t w = (size_t)width; + do { + vuint8m4_t v_u, v_v; + size_t vl = __riscv_vsetvl_e8m4(w); + v_u = __riscv_vle8_v_u8m4(src_u, vl); + v_v = __riscv_vle8_v_u8m4(src_v, vl); + __riscv_vsseg2e8_v_u8m4(dst_uv, v_u, v_v, vl); + w -= vl; + src_u += vl; + src_v += vl; + dst_uv += 2 * vl; + } while (w > 0); +} + +struct RgbConstants { + uint8_t kRGBToY[4]; + uint16_t kAddY; + uint16_t pad; +}; + +// RGB to JPeg coefficients +// B * 0.1140 coefficient = 29 +// G * 0.5870 coefficient = 150 +// R * 0.2990 coefficient = 77 +// Add 0.5 = 0x80 +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, + 128, + 0}; + +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; + +// RGB to BT.601 coefficients +// B * 0.1016 coefficient = 25 +// G * 0.5078 coefficient = 129 +// R * 0.2578 
coefficient = 66 +// Add 16.5 = 0x1080 + +static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + 0x1080, + 0}; + +static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, + 0x1080, + 0}; + +// ARGB expects first 3 values to contain RGB and 4th value is ignored. +void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + assert(width != 0); + size_t w = (size_t)width; + vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant + vuint16m4_t v_addy; // vector is to store kAddY + size_t vl = __riscv_vsetvl_e8m2(w); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); + do { + vuint8m2_t v_b, v_g, v_r, v_a, v_y; + vuint16m4_t v_y_u16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); + v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); + v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); + __riscv_vse8_v_u8m2(dst_y, v_y, vl); + w -= vl; + src_argb += 4 * vl; + dst_y += vl; + } while (w > 0); +} + +void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_RVV(src_argb, dst_y, width, &kRgb24I601Constants); +} + +void ARGBToYJRow_RVV(const uint8_t* src_argb, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_RVV(src_argb, dst_yj, width, &kRgb24JPEGConstants); +} + +void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_RVV(src_abgr, dst_y, width, &kRawI601Constants); +} + +void ABGRToYJRow_RVV(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_RVV(src_abgr, 
dst_yj, width, &kRawJPEGConstants); +} + +// RGBA expects first value to be A and ignored, then 3 values to contain RGB. +void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + assert(width != 0); + size_t w = (size_t)width; + vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant + vuint16m4_t v_addy; // vector is to store kAddY + size_t vl = __riscv_vsetvl_e8m2(w); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); + do { + vuint8m2_t v_b, v_g, v_r, v_a, v_y; + vuint16m4_t v_y_u16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_a, &v_b, &v_g, &v_r, src_rgba, vl); + v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); + v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); + v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); + __riscv_vse8_v_u8m2(dst_y, v_y, vl); + w -= vl; + src_rgba += 4 * vl; + dst_y += vl; + } while (w > 0); +} + +void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_RVV(src_rgba, dst_y, width, &kRgb24I601Constants); +} + +void RGBAToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { + RGBAToYMatrixRow_RVV(src_rgba, dst_yj, width, &kRgb24JPEGConstants); +} + +void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + RGBAToYMatrixRow_RVV(src_bgra, dst_y, width, &kRawI601Constants); +} + +void RGBToYMatrixRow_RVV(const uint8_t* src_rgb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + assert(width != 0); + size_t w = (size_t)width; + vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant + vuint16m4_t v_addy; // vector is to 
store kAddY + size_t vl = __riscv_vsetvl_e8m2(w); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); + do { + vuint8m2_t v_b, v_g, v_r, v_y; + vuint16m4_t v_y_u16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb, vl); + v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); + v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); + v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); + __riscv_vse8_v_u8m2(dst_y, v_y, vl); + w -= vl; + src_rgb += 3 * vl; + dst_y += vl; + } while (w > 0); +} + +void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_RVV(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); +} + +void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_RVV(src_raw, dst_yj, width, &kRawJPEGConstants); +} + +void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_RVV(src_rgb24, dst_y, width, &kRgb24I601Constants); +} + +void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_RVV(src_raw, dst_y, width, &kRawI601Constants); +} + +void ARGBAttenuateRow_RVV(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + // To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up(0). 
+ asm volatile("csrwi vxrm, 0"); + do { + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_ba_16, v_ga_16, v_ra_16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + v_ba_16 = __riscv_vwmulu_vv_u16m4(v_b, v_a, vl); + v_ga_16 = __riscv_vwmulu_vv_u16m4(v_g, v_a, vl); + v_ra_16 = __riscv_vwmulu_vv_u16m4(v_r, v_a, vl); + v_b = __riscv_vnclipu_wx_u8m2(v_ba_16, 8, vl); + v_g = __riscv_vnclipu_wx_u8m2(v_ga_16, 8, vl); + v_r = __riscv_vnclipu_wx_u8m2(v_ra_16, 8, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_argb += vl * 4; + dst_argb += vl * 4; + } while (w > 0); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) diff --git a/files/source/row_win.cc b/files/source/row_win.cc index c7c1ff60..5fb28521 100644 --- a/files/source/row_win.cc +++ b/files/source/row_win.cc @@ -14,7 +14,9 @@ #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) -#if defined(_M_X64) +#if defined(_M_ARM64EC) +#include +#elif defined(_M_X64) #include #include // For _mm_maddubs_epi16 #endif @@ -893,7 +895,7 @@ __declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb, __declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width) { __asm { @@ -940,7 +942,7 @@ __declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb, #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 __declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, - const uint32_t dither4, + uint32_t dither4, int width) { __asm { mov eax, [esp + 4] // src_argb @@ -2789,6 +2791,44 @@ __declspec(naked) void I422ToRGB24Row_SSSE3( } } +// 8 pixels. +// 8 UV values, mixed with 8 Y producing 8 RGB24 (24 bytes). 
+__declspec(naked) void I444ToRGB24Row_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebx, [esp + 12 + 20] // yuvconstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi + movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 + movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 + + convertloop: + READYUV444 + YUVTORGB(ebx) + STORERGB24 + + sub ecx, 8 + jg convertloop + + pop ebx + pop edi + pop esi + ret + } +} + // 8 pixels // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). __declspec(naked) void I422ToRGB565Row_SSSE3( @@ -3423,17 +3463,14 @@ __declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u, sub edx, eax convertloop: - vmovdqu ymm0, [eax] // read 32 U's - vmovdqu ymm1, [eax + edx] // and 32 V's - lea eax, [eax + 32] - vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 - vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. 
mutated qqword 1,3 - vextractf128 [edi], ymm2, 0 // bytes 0..15 - vextractf128 [edi + 16], ymm0, 0 // bytes 16..31 - vextractf128 [edi + 32], ymm2, 1 // bytes 32..47 - vextractf128 [edi + 48], ymm0, 1 // bytes 47..63 - lea edi, [edi + 64] - sub ecx, 32 + vpmovzxbw ymm0, [eax] + vpmovzxbw ymm1, [eax + edx] + lea eax, [eax + 16] + vpsllw ymm1, ymm1, 8 + vpor ymm2, ymm1, ymm0 + vmovdqu [edi], ymm2 + lea edi, [edi + 32] + sub ecx, 16 jg convertloop pop edi diff --git a/files/source/scale.cc b/files/source/scale.cc index e1335f1e..80b030dc 100644 --- a/files/source/scale.cc +++ b/files/source/scale.cc @@ -198,6 +198,51 @@ static void ScalePlaneDown2_16(int src_width, } } +void ScalePlaneDown2_16To8(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint8_t* dst_ptr, + int scale, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown2)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width, int scale) = + (src_width & 1) + ? (filtering == kFilterNone + ? ScaleRowDown2_16To8_Odd_C + : (filtering == kFilterLinear ? ScaleRowDown2Linear_16To8_Odd_C + : ScaleRowDown2Box_16To8_Odd_C)) + : (filtering == kFilterNone + ? ScaleRowDown2_16To8_C + : (filtering == kFilterLinear ? ScaleRowDown2Linear_16To8_C + : ScaleRowDown2Box_16To8_C)); + int row_stride = src_stride * 2; + (void)dst_height; + if (!filtering) { + src_ptr += src_stride; // Point to odd rows. + src_stride = 0; + } + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (y = 0; y < src_height / 2; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width, scale); + src_ptr += row_stride; + dst_ptr += dst_stride; + } + if (src_height & 1) { + if (!filtering) { + src_ptr -= src_stride; // Point to last row. + } + ScaleRowDown2(src_ptr, 0, dst_ptr, dst_width, scale); + } +} + // Scale plane, 1/4 // This is an optimized version for scaling down a plane to 1/4 of // its original size. 
@@ -775,9 +820,11 @@ static void ScaleAddCols2_C(int dst_width, int ix = x >> 16; x += dx; boxwidth = MIN1((x >> 16) - ix); - *dst_ptr++ = - SumPixels(boxwidth, src_ptr + ix) * scaletbl[boxwidth - minboxwidth] >> - 16; + int scaletbl_index = boxwidth - minboxwidth; + assert((scaletbl_index == 0) || (scaletbl_index == 1)); + *dst_ptr++ = (uint8_t)(SumPixels(boxwidth, src_ptr + ix) * + scaletbl[scaletbl_index] >> + 16); } } @@ -797,9 +844,10 @@ static void ScaleAddCols2_16_C(int dst_width, int ix = x >> 16; x += dx; boxwidth = MIN1((x >> 16) - ix); - *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) * - scaletbl[boxwidth - minboxwidth] >> - 16; + int scaletbl_index = boxwidth - minboxwidth; + assert((scaletbl_index == 0) || (scaletbl_index == 1)); + *dst_ptr++ = + SumPixels_16(boxwidth, src_ptr + ix) * scaletbl[scaletbl_index] >> 16; } } @@ -814,7 +862,7 @@ static void ScaleAddCols0_C(int dst_width, (void)dx; src_ptr += (x >> 16); for (i = 0; i < dst_width; ++i) { - *dst_ptr++ = src_ptr[i] * scaleval >> 16; + *dst_ptr++ = (uint8_t)(src_ptr[i] * scaleval >> 16); } } @@ -829,7 +877,7 @@ static void ScaleAddCols1_C(int dst_width, int i; x >>= 16; for (i = 0; i < dst_width; ++i) { - *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; + *dst_ptr++ = (uint8_t)(SumPixels(boxwidth, src_ptr + x) * scaleval >> 16); x += boxwidth; } } @@ -1020,10 +1068,10 @@ void ScalePlaneBilinearDown(int src_width, const int max_y = (src_height - 1) << 16; int j; - void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*ScaleFilterCols)(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx) = (src_width >= 32768) ? 
ScaleFilterCols64_C : ScaleFilterCols_C; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -1070,6 +1118,11 @@ void ScalePlaneBilinearDown(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif #if defined(HAS_SCALEFILTERCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { @@ -1143,10 +1196,10 @@ void ScalePlaneBilinearDown_16(int src_width, const int max_y = (src_height - 1) << 16; int j; - void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, + void (*ScaleFilterCols)(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C; - void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, + void (*InterpolateRow)(uint16_t* dst_ptr, const uint16_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_16_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -1231,10 +1284,10 @@ void ScalePlaneBilinearUp(int src_width, int dx = 0; int dy = 0; const int max_y = (src_height - 1) << 16; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*InterpolateRow)(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, + void (*ScaleFilterCols)(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx) = filtering ? 
ScaleFilterCols_C : ScaleCols_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -1265,6 +1318,11 @@ void ScalePlaneBilinearUp(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif if (filtering && src_width >= 32768) { ScaleFilterCols = ScaleFilterCols64_C; @@ -1315,11 +1373,11 @@ void ScalePlaneBilinearUp(int src_width, const uint8_t* src = src_ptr + yi * (int64_t)src_stride; // Allocate 2 row buffers. - const int kRowSize = (dst_width + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (dst_width + 31) & ~31; + align_buffer_64(row, row_size * 2); uint8_t* rowptr = row; - int rowstride = kRowSize; + int rowstride = row_size; int lasty = yi; ScaleFilterCols(rowptr, src, dst_width, x, dx); @@ -1699,10 +1757,10 @@ void ScalePlaneBilinearUp_16(int src_width, int dx = 0; int dy = 0; const int max_y = (src_height - 1) << 16; - void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, + void (*InterpolateRow)(uint16_t* dst_ptr, const uint16_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_16_C; - void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, + void (*ScaleFilterCols)(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width, int x, int dx) = filtering ? ScaleFilterCols_16_C : ScaleCols_16_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -1766,11 +1824,11 @@ void ScalePlaneBilinearUp_16(int src_width, const uint16_t* src = src_ptr + yi * (int64_t)src_stride; // Allocate 2 row buffers. 
- const int kRowSize = (dst_width + 31) & ~31; - align_buffer_64(row, kRowSize * 4); + const int row_size = (dst_width + 31) & ~31; + align_buffer_64(row, row_size * 4); uint16_t* rowptr = (uint16_t*)row; - int rowstride = kRowSize; + int rowstride = row_size; int lasty = yi; ScaleFilterCols(rowptr, src, dst_width, x, dx); @@ -1827,7 +1885,7 @@ static void ScalePlaneSimple(int src_width, const uint8_t* src_ptr, uint8_t* dst_ptr) { int i; - void (*ScaleCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width, + void (*ScaleCols)(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx) = ScaleCols_C; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -1864,7 +1922,7 @@ static void ScalePlaneSimple_16(int src_width, const uint16_t* src_ptr, uint16_t* dst_ptr) { int i; - void (*ScaleCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width, + void (*ScaleCols)(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width, int x, int dx) = ScaleCols_16_C; // Initial source x/y coordinate and step values as 16.16 fixed point. 
int x = 0; diff --git a/files/source/scale_any.cc b/files/source/scale_any.cc index 317041f8..f6576874 100644 --- a/files/source/scale_any.cc +++ b/files/source/scale_any.cc @@ -128,6 +128,22 @@ SDODD(ScaleRowDown2Box_Odd_NEON, 1, 15) #endif +#ifdef HAS_SCALEUVROWDOWN2_NEON +SDANY(ScaleUVRowDown2_Any_NEON, + ScaleUVRowDown2_NEON, + ScaleUVRowDown2_C, + 2, + 2, + 7) +#endif +#ifdef HAS_SCALEUVROWDOWN2LINEAR_NEON +SDANY(ScaleUVRowDown2Linear_Any_NEON, + ScaleUVRowDown2Linear_NEON, + ScaleUVRowDown2Linear_C, + 2, + 2, + 7) +#endif #ifdef HAS_SCALEUVROWDOWN2BOX_NEON SDANY(ScaleUVRowDown2Box_Any_NEON, ScaleUVRowDown2Box_NEON, diff --git a/files/source/scale_argb.cc b/files/source/scale_argb.cc index 9c3acf7f..ddd8d29e 100644 --- a/files/source/scale_argb.cc +++ b/files/source/scale_argb.cc @@ -58,9 +58,9 @@ static void ScaleARGBDown2(int src_width, assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. // Advance to odd row, even column. if (filtering == kFilterBilinear) { - src_argb += (y >> 16) * (int64_t)src_stride + (x >> 16) * 4; + src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4; } else { - src_argb += (y >> 16) * (int64_t)src_stride + ((x >> 16) - 1) * 4; + src_argb += (y >> 16) * (intptr_t)src_stride + ((x >> 16) - 1) * 4; } #if defined(HAS_SCALEARGBROWDOWN2_SSE2) @@ -155,14 +155,14 @@ static void ScaleARGBDown4Box(int src_width, int dy) { int j; // Allocate 2 rows of ARGB. - const int kRowSize = (dst_width * 2 * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (dst_width * 2 * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); int row_stride = src_stride * (dy >> 16); void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C; // Advance to odd row, even column. 
- src_argb += (y >> 16) * (int64_t)src_stride + (x >> 16) * 4; + src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4; (void)src_width; (void)src_height; (void)dx; @@ -187,9 +187,9 @@ static void ScaleARGBDown4Box(int src_width, for (j = 0; j < dst_height; ++j) { ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2); - ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + kRowSize, + ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + row_size, dst_width * 2); - ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width); + ScaleARGBRowDown2(row, row_size, dst_argb, dst_width); src_argb += row_stride; dst_argb += dst_stride; } @@ -214,7 +214,7 @@ static void ScaleARGBDownEven(int src_width, enum FilterMode filtering) { int j; int col_step = dx >> 16; - int row_stride = (dy >> 16) * (int64_t)src_stride; + ptrdiff_t row_stride = (ptrdiff_t)((dy >> 16) * (intptr_t)src_stride); void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride, int src_step, uint8_t* dst_argb, int dst_width) = filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C; @@ -222,7 +222,7 @@ static void ScaleARGBDownEven(int src_width, (void)src_height; assert(IS_ALIGNED(src_width, 2)); assert(IS_ALIGNED(src_height, 2)); - src_argb += (y >> 16) * (int64_t)src_stride + (x >> 16) * 4; + src_argb += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4; #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ScaleARGBRowDownEven = filtering ? 
ScaleARGBRowDownEvenBox_Any_SSE2 @@ -289,10 +289,10 @@ static void ScaleARGBBilinearDown(int src_width, int dy, enum FilterMode filtering) { int j; - void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C; int64_t xlast = x + (int64_t)(dst_width - 1) * dx; @@ -348,6 +348,11 @@ static void ScaleARGBBilinearDown(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; @@ -388,7 +393,7 @@ static void ScaleARGBBilinearDown(int src_width, } for (j = 0; j < dst_height; ++j) { int yi = y >> 16; - const uint8_t* src = src_argb + yi * (int64_t)src_stride; + const uint8_t* src = src_argb + yi * (intptr_t)src_stride; if (filtering == kFilterLinear) { ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx); } else { @@ -421,10 +426,10 @@ static void ScaleARGBBilinearUp(int src_width, int dy, enum FilterMode filtering) { int j; - void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) = filtering ? 
ScaleARGBFilterCols_C : ScaleARGBCols_C; const int max_y = (src_height - 1) << 16; @@ -467,6 +472,11 @@ static void ScaleARGBBilinearUp(int src_width, InterpolateRow = InterpolateRow_LSX; } } +#endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } #endif if (src_width >= 32768) { ScaleARGBFilterCols = @@ -545,14 +555,14 @@ static void ScaleARGBBilinearUp(int src_width, { int yi = y >> 16; - const uint8_t* src = src_argb + yi * (int64_t)src_stride; + const uint8_t* src = src_argb + yi * (intptr_t)src_stride; // Allocate 2 rows of ARGB. - const int kRowSize = (dst_width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (dst_width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); uint8_t* rowptr = row; - int rowstride = kRowSize; + int rowstride = row_size; int lasty = yi; ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); @@ -570,7 +580,7 @@ static void ScaleARGBBilinearUp(int src_width, if (y > max_y) { y = max_y; yi = y >> 16; - src = src_argb + yi * (int64_t)src_stride; + src = src_argb + yi * (intptr_t)src_stride; } if (yi != lasty) { ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); @@ -659,6 +669,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_I422TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + I422ToARGBRow = I422ToARGBRow_Any_LSX; + if (IS_ALIGNED(src_width, 16)) { + I422ToARGBRow = I422ToARGBRow_LSX; + } + } +#endif #if defined(HAS_I422TOARGBROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { I422ToARGBRow = I422ToARGBRow_Any_LASX; @@ -667,8 +685,13 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_I422TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422ToARGBRow = I422ToARGBRow_RVV; + } +#endif - void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = 
InterpolateRow_C; #if defined(HAS_INTERPOLATEROW_SSSE3) @@ -711,8 +734,13 @@ static void ScaleYUVToARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif - void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) = filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; if (src_width >= 32768) { @@ -793,19 +821,19 @@ static void ScaleYUVToARGBBilinearUp(int src_width, const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate. int yi = y >> 16; int uv_yi = yi >> kYShift; - const uint8_t* src_row_y = src_y + yi * (int64_t)src_stride_y; - const uint8_t* src_row_u = src_u + uv_yi * (int64_t)src_stride_u; - const uint8_t* src_row_v = src_v + uv_yi * (int64_t)src_stride_v; + const uint8_t* src_row_y = src_y + yi * (intptr_t)src_stride_y; + const uint8_t* src_row_u = src_u + uv_yi * (intptr_t)src_stride_u; + const uint8_t* src_row_v = src_v + uv_yi * (intptr_t)src_stride_v; // Allocate 2 rows of ARGB. - const int kRowSize = (dst_width * 4 + 31) & ~31; - align_buffer_64(row, kRowSize * 2); + const int row_size = (dst_width * 4 + 31) & ~31; + align_buffer_64(row, row_size * 2); // Allocate 1 row of ARGB for source conversion. align_buffer_64(argb_row, src_width * 4); uint8_t* rowptr = row; - int rowstride = kRowSize; + int rowstride = row_size; int lasty = yi; // TODO(fbarchard): Convert first 2 rows of YUV to ARGB. 
@@ -833,9 +861,9 @@ static void ScaleYUVToARGBBilinearUp(int src_width, y = max_y; yi = y >> 16; uv_yi = yi >> kYShift; - src_row_y = src_y + yi * (int64_t)src_stride_y; - src_row_u = src_u + uv_yi * (int64_t)src_stride_u; - src_row_v = src_v + uv_yi * (int64_t)src_stride_v; + src_row_y = src_y + yi * (intptr_t)src_stride_y; + src_row_u = src_u + uv_yi * (intptr_t)src_stride_u; + src_row_v = src_v + uv_yi * (intptr_t)src_stride_v; } if (yi != lasty) { // TODO(fbarchard): Convert the clipped region of row. @@ -883,7 +911,7 @@ static void ScaleARGBSimple(int src_width, int y, int dy) { int j; - void (*ScaleARGBCols)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*ScaleARGBCols)(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C; (void)src_height; @@ -926,7 +954,7 @@ static void ScaleARGBSimple(int src_width, } for (j = 0; j < dst_height; ++j) { - ScaleARGBCols(dst_argb, src_argb + (y >> 16) * (int64_t)src_stride, + ScaleARGBCols(dst_argb, src_argb + (y >> 16) * (intptr_t)src_stride, dst_width, x, dx); dst_argb += dst_stride; y += dy; @@ -962,7 +990,7 @@ static void ScaleARGB(const uint8_t* src, // Negative src_height means invert the image. if (src_height < 0) { src_height = -src_height; - src = src + (src_height - 1) * (int64_t)src_stride; + src = src + (src_height - 1) * (intptr_t)src_stride; src_stride = -src_stride; } ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -977,7 +1005,7 @@ static void ScaleARGB(const uint8_t* src, if (clip_y) { int64_t clipf = (int64_t)(clip_y)*dy; y += (clipf & 0xffff); - src += (clipf >> 16) * (int64_t)src_stride; + src += (clipf >> 16) * (intptr_t)src_stride; dst += clip_y * dst_stride; } @@ -1011,7 +1039,7 @@ static void ScaleARGB(const uint8_t* src, filtering = kFilterNone; if (dx == 0x10000 && dy == 0x10000) { // Straight copy. 
- ARGBCopy(src + (y >> 16) * (int64_t)src_stride + (x >> 16) * 4, + ARGBCopy(src + (y >> 16) * (intptr_t)src_stride + (x >> 16) * 4, src_stride, dst, dst_stride, clip_width, clip_height); return; } diff --git a/files/source/scale_common.cc b/files/source/scale_common.cc index b02bdafd..77455903 100644 --- a/files/source/scale_common.cc +++ b/files/source/scale_common.cc @@ -23,6 +23,25 @@ namespace libyuv { extern "C" { #endif +#ifdef __cplusplus +#define STATIC_CAST(type, expr) static_cast(expr) +#else +#define STATIC_CAST(type, expr) (type)(expr) +#endif + +// TODO(fbarchard): make clamp255 preserve negative values. +static __inline int32_t clamp255(int32_t v) { + return (-(v >= 255) | v) & 255; +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +// TODO(fbarchard): change scale to bits +#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16) + static __inline int Abs(int v) { return v >= 0 ? 
v : -v; } @@ -62,6 +81,50 @@ void ScaleRowDown2_16_C(const uint16_t* src_ptr, } } +void ScaleRowDown2_16To8_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale) { + int x; + (void)src_stride; + assert(scale >= 256); + assert(scale <= 32768); + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[1], scale)); + dst[1] = STATIC_CAST(uint8_t, C16TO8(src_ptr[3], scale)); + dst += 2; + src_ptr += 4; + } + if (dst_width & 1) { + dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[1], scale)); + } +} + +void ScaleRowDown2_16To8_Odd_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale) { + int x; + (void)src_stride; + assert(scale >= 256); + assert(scale <= 32768); + dst_width -= 1; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[1], scale)); + dst[1] = STATIC_CAST(uint8_t, C16TO8(src_ptr[3], scale)); + dst += 2; + src_ptr += 4; + } + if (dst_width & 1) { + dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[1], scale)); + dst += 1; + src_ptr += 2; + } + dst[0] = STATIC_CAST(uint8_t, C16TO8(src_ptr[0], scale)); +} + void ScaleRowDown2Linear_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ -98,6 +161,52 @@ void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr, } } +void ScaleRowDown2Linear_16To8_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale) { + const uint16_t* s = src_ptr; + int x; + (void)src_stride; + assert(scale >= 256); + assert(scale <= 32768); + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + s[1] + 1) >> 1, scale)); + dst[1] = STATIC_CAST(uint8_t, C16TO8((s[2] + s[3] + 1) >> 1, scale)); + dst += 2; + s += 4; + } + if (dst_width & 1) { + dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + s[1] + 1) >> 1, scale)); + } +} + +void ScaleRowDown2Linear_16To8_Odd_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* 
dst, + int dst_width, + int scale) { + const uint16_t* s = src_ptr; + int x; + (void)src_stride; + assert(scale >= 256); + assert(scale <= 32768); + dst_width -= 1; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + s[1] + 1) >> 1, scale)); + dst[1] = STATIC_CAST(uint8_t, C16TO8((s[2] + s[3] + 1) >> 1, scale)); + dst += 2; + s += 4; + } + if (dst_width & 1) { + dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + s[1] + 1) >> 1, scale)); + dst += 1; + s += 2; + } + dst[0] = STATIC_CAST(uint8_t, C16TO8(s[0], scale)); +} + void ScaleRowDown2Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ -160,6 +269,61 @@ void ScaleRowDown2Box_16_C(const uint16_t* src_ptr, } } +void ScaleRowDown2Box_16To8_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + int x; + assert(scale >= 256); + assert(scale <= 32768); + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = STATIC_CAST(uint8_t, + C16TO8((s[0] + s[1] + t[0] + t[1] + 2) >> 2, scale)); + dst[1] = STATIC_CAST(uint8_t, + C16TO8((s[2] + s[3] + t[2] + t[3] + 2) >> 2, scale)); + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = STATIC_CAST(uint8_t, + C16TO8((s[0] + s[1] + t[0] + t[1] + 2) >> 2, scale)); + } +} + +void ScaleRowDown2Box_16To8_Odd_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width, + int scale) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + int x; + assert(scale >= 256); + assert(scale <= 32768); + dst_width -= 1; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = STATIC_CAST(uint8_t, + C16TO8((s[0] + s[1] + t[0] + t[1] + 2) >> 2, scale)); + dst[1] = STATIC_CAST(uint8_t, + C16TO8((s[2] + s[3] + t[2] + t[3] + 2) >> 2, scale)); + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = STATIC_CAST(uint8_t, + C16TO8((s[0] + s[1] + t[0] + t[1] + 2) >> 2, 
scale)); + dst += 1; + s += 2; + t += 2; + } + dst[0] = STATIC_CAST(uint8_t, C16TO8((s[0] + t[0] + 1) >> 1, scale)); +} + void ScaleRowDown4_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ -1116,18 +1280,13 @@ void ScaleUVRowDown2_C(const uint8_t* src_uv, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width) { - const uint16_t* src = (const uint16_t*)(src_uv); - uint16_t* dst = (uint16_t*)(dst_uv); int x; (void)src_stride; - for (x = 0; x < dst_width - 1; x += 2) { - dst[0] = src[1]; - dst[1] = src[3]; - src += 2; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[1]; + for (x = 0; x < dst_width; ++x) { + dst_uv[0] = src_uv[2]; // Store the 2nd UV + dst_uv[1] = src_uv[3]; + src_uv += 4; + dst_uv += 2; } } @@ -1469,7 +1628,7 @@ void ScalePlaneVertical(int src_height, enum FilterMode filtering) { // TODO(fbarchard): Allow higher bpp. int dst_width_bytes = dst_width * bpp; - void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; @@ -1519,6 +1678,12 @@ void ScalePlaneVertical(int src_height, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif + for (j = 0; j < dst_height; ++j) { int yi; int yf; @@ -1548,7 +1713,7 @@ void ScalePlaneVertical_16(int src_height, enum FilterMode filtering) { // TODO(fbarchard): Allow higher wpp. int dst_width_words = dst_width * wpp; - void (*InterpolateRow)(uint16_t * dst_argb, const uint16_t* src_argb, + void (*InterpolateRow)(uint16_t* dst_argb, const uint16_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_16_C; const int max_y = (src_height > 1) ? 
((src_height - 1) << 16) - 1 : 0; @@ -1627,7 +1792,7 @@ void ScalePlaneVertical_16To8(int src_height, // TODO(fbarchard): Allow higher wpp. int dst_width_words = dst_width * wpp; // TODO(https://crbug.com/libyuv/931): Add NEON 32 bit and AVX2 versions. - void (*InterpolateRow_16To8)(uint8_t * dst_argb, const uint16_t* src_argb, + void (*InterpolateRow_16To8)(uint8_t* dst_argb, const uint16_t* src_argb, ptrdiff_t src_stride, int scale, int dst_width, int source_y_fraction) = InterpolateRow_16To8_C; const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; diff --git a/files/source/scale_gcc.cc b/files/source/scale_gcc.cc index edaf2e29..17eeffad 100644 --- a/files/source/scale_gcc.cc +++ b/files/source/scale_gcc.cc @@ -1094,7 +1094,8 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)), // %4 "m"(kLinearShuffleFar) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif @@ -1294,7 +1295,7 @@ void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "m"(kLinearMadd31) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif diff --git a/files/source/scale_mmi.cc b/files/source/scale_mmi.cc deleted file mode 100644 index 1226ef3e..00000000 --- a/files/source/scale_mmi.cc +++ /dev/null @@ -1,1168 +0,0 @@ -/* - * Copyright 2013 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "libyuv/scale.h" - -#include -#include - -#include "libyuv/cpu_id.h" -#include "libyuv/planar_functions.h" // For CopyARGB -#include "libyuv/row.h" -#include "libyuv/scale_row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for Mips MMI. -#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -// clang-format off - -// CPU agnostic row functions -void ScaleRowDown2_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1, dest; - const uint64_t shift = 0x8ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "psrlh %[src0], %[src0], %[shift] \n\t" - - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "psrlh %[src1], %[src1], %[shift] \n\t" - - "packushb %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift] "f"(shift) - : "memory"); -} - -void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest0, dest1; - - const uint64_t mask = 0x00ff00ff00ff00ffULL; - const uint64_t shift = 0x8ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "and %[dest0], %[src0], %[mask] \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "and %[dest1], %[src1], %[mask] \n\t" - "packushb %[dest0], %[dest0], %[dest1] \n\t" - - "psrlh %[src0], 
%[src0], %[shift] \n\t" - "psrlh %[src1], %[src1], %[shift] \n\t" - "packushb %[dest1], %[src0], %[src1] \n\t" - - "pavgb %[dest], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest0] "=&f"(dest0), - [dest1] "=&f"(dest1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [mask] "f"(mask), - [shift] "f"(shift), [width] "r"(dst_width) - : "memory"); -} - -void ScaleRowDown2Box_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - const uint8_t* s = src_ptr; - const uint8_t* t = src_ptr + src_stride; - - uint64_t s0, s1, t0, t1; - uint64_t dest, dest0, dest1; - - const uint64_t ph = 0x0002000200020002ULL; - const uint64_t mask = 0x00ff00ff00ff00ffULL; - const uint64_t shift0 = 0x2ULL; - const uint64_t shift1 = 0x8ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[s0], 0x00(%[s]) \n\t" - "gsldlc1 %[s0], 0x07(%[s]) \n\t" - "psrlh %[s1], %[s0], %[shift1] \n\t" - "and %[s0], %[s0], %[mask] \n\t" - - "gsldrc1 %[t0], 0x00(%[t]) \n\t" - "gsldlc1 %[t0], 0x07(%[t]) \n\t" - "psrlh %[t1], %[t0], %[shift1] \n\t" - "and %[t0], %[t0], %[mask] \n\t" - - "paddh %[dest0], %[s0], %[s1] \n\t" - "paddh %[dest0], %[dest0], %[t0] \n\t" - "paddh %[dest0], %[dest0], %[t1] \n\t" - "paddh %[dest0], %[dest0], %[ph] \n\t" - "psrlh %[dest0], %[dest0], %[shift0] \n\t" - - "gsldrc1 %[s0], 0x08(%[s]) \n\t" - "gsldlc1 %[s0], 0x0f(%[s]) \n\t" - "psrlh %[s1], %[s0], %[shift1] \n\t" - "and %[s0], %[s0], %[mask] \n\t" - - "gsldrc1 %[t0], 0x08(%[t]) \n\t" - "gsldlc1 %[t0], 0x0f(%[t]) \n\t" - "psrlh %[t1], %[t0], %[shift1] \n\t" - "and %[t0], %[t0], %[mask] \n\t" - - "paddh %[dest1], %[s0], %[s1] \n\t" - "paddh %[dest1], %[dest1], %[t0] \n\t" - "paddh %[dest1], %[dest1], %[t1] \n\t" - 
"paddh %[dest1], %[dest1], %[ph] \n\t" - "psrlh %[dest1], %[dest1], %[shift0] \n\t" - - "packushb %[dest], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[s], %[s], 0x10 \n\t" - "daddiu %[t], %[t], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest] "=&f"(dest) - : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph), - [mask] "f"(mask) - : "memory"); -} - -void ScaleARGBRowDown2_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - - uint64_t src0, src1, dest; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "punpckhwd %[dest], %[src0], %[src1] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width) - : "memory"); -} - -void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest_hi, dest_lo; - - __asm__ volatile( - "1: \n\t" - "lwc1 %[src0], 0x00(%[src_ptr]) \n\t" - "lwc1 %[src1], 0x08(%[src_ptr]) \n\t" - "punpcklwd %[dest_lo], %[src0], %[src1] \n\t" - "lwc1 %[src0], 
0x04(%[src_ptr]) \n\t" - "lwc1 %[src1], 0x0c(%[src_ptr]) \n\t" - "punpcklwd %[dest_hi], %[src0], %[src1] \n\t" - - "pavgb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width) - : "memory"); -} - -void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - uint8_t* dst_argb, - int dst_width) { - const uint8_t* s = src_argb; - const uint8_t* t = src_argb + src_stride; - - uint64_t s0, s_hi, s_lo; - uint64_t t0, t_hi, t_lo; - uint64_t dest, dest_hi, dest_lo; - - const uint64_t mask = 0x0ULL; - const uint64_t ph = 0x0002000200020002ULL; - const uint64_t shfit = 0x2ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[s0], 0x00(%[s]) \n\t" - "gsldlc1 %[s0], 0x07(%[s]) \n\t" - "punpcklbh %[s_lo], %[s0], %[mask] \n\t" - "punpckhbh %[s_hi], %[s0], %[mask] \n\t" - "paddh %[dest_lo], %[s_lo], %[s_hi] \n\t" - - "gsldrc1 %[t0], 0x00(%[t]) \n\t" - "gsldlc1 %[t0], 0x07(%[t]) \n\t" - "punpcklbh %[t_lo], %[t0], %[mask] \n\t" - "punpckhbh %[t_hi], %[t0], %[mask] \n\t" - "paddh %[dest_lo], %[dest_lo], %[t_lo] \n\t" - "paddh %[dest_lo], %[dest_lo], %[t_hi] \n\t" - - "paddh %[dest_lo], %[dest_lo], %[ph] \n\t" - "psrlh %[dest_lo], %[dest_lo], %[shfit] \n\t" - - "gsldrc1 %[s0], 0x08(%[s]) \n\t" - "gsldlc1 %[s0], 0x0f(%[s]) \n\t" - "punpcklbh %[s_lo], %[s0], %[mask] \n\t" - "punpckhbh %[s_hi], %[s0], %[mask] \n\t" - "paddh %[dest_hi], %[s_lo], %[s_hi] \n\t" - - "gsldrc1 %[t0], 0x08(%[t]) \n\t" - "gsldlc1 %[t0], 0x0f(%[t]) \n\t" - "punpcklbh %[t_lo], %[t0], %[mask] \n\t" - "punpckhbh %[t_hi], %[t0], %[mask] \n\t" - "paddh %[dest_hi], %[dest_hi], %[t_lo] 
\n\t" - "paddh %[dest_hi], %[dest_hi], %[t_hi] \n\t" - - "paddh %[dest_hi], %[dest_hi], %[ph] \n\t" - "psrlh %[dest_hi], %[dest_hi], %[shfit] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[s], %[s], 0x10 \n\t" - "daddiu %[t], %[t], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [s0] "=&f"(s0), [t0] "=&f"(t0), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), - [t_hi] "=&f"(t_hi), [t_lo] "=&f"(t_lo), [dest] "=&f"(dest) - : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width), - [mask] "f"(mask), [ph] "f"(ph), [shfit] "f"(shfit) - : "memory"); -} - -void ScaleRowDown2_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1, dest; - const uint64_t shift = 0x10ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "psrlw %[src0], %[src0], %[shift] \n\t" - - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "psrlw %[src1], %[src1], %[shift] \n\t" - - "packsswh %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift] "f"(shift) - : "memory"); -} - -void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest_hi, dest_lo; - - __asm__ volatile( - "1: \n\t" - 
"gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "punpcklhw %[dest_lo], %[src0], %[src1] \n\t" - "punpckhhw %[dest_hi], %[src0], %[src1] \n\t" - - "punpcklhw %[src0], %[dest_lo], %[dest_hi] \n\t" - "punpckhhw %[src1], %[dest_lo], %[dest_hi] \n\t" - - "pavgh %[dest], %[src0], %[src1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width) - : "memory"); -} - -void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* s = src_ptr; - const uint16_t* t = src_ptr + src_stride; - - uint64_t s0, s1, s_hi, s_lo; - uint64_t t0, t1, t_hi, t_lo; - uint64_t dest, dest0, dest1; - - const uint64_t ph = 0x0000000200000002ULL; - const uint64_t mask = 0x0000ffff0000ffffULL; - const uint64_t shift0 = 0x10ULL; - const uint64_t shift1 = 0x2ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[s0], 0x00(%[s]) \n\t" - "gsldlc1 %[s0], 0x07(%[s]) \n\t" - "psrlw %[s1], %[s0], %[shift0] \n\t" - "and %[s0], %[s0], %[mask] \n\t" - - "gsldrc1 %[t0], 0x00(%[t]) \n\t" - "gsldlc1 %[t0], 0x07(%[t]) \n\t" - "psrlw %[t1], %[t0], %[shift0] \n\t" - "and %[t0], %[t0], %[mask] \n\t" - - "paddw %[dest0], %[s0], %[s1] \n\t" - "paddw %[dest0], %[dest0], %[t0] \n\t" - "paddw %[dest0], %[dest0], %[t1] \n\t" - "paddw %[dest0], %[dest0], %[ph] \n\t" - "psrlw %[dest0], %[dest0], %[shift1] \n\t" - - "gsldrc1 %[s0], 0x08(%[s]) \n\t" - "gsldlc1 %[s0], 0x0f(%[s]) \n\t" - "psrlw %[s1], %[s0], %[shift0] \n\t" - "and %[s0], %[s0], %[mask] \n\t" - - 
"gsldrc1 %[t0], 0x08(%[t]) \n\t" - "gsldlc1 %[t0], 0x0f(%[t]) \n\t" - "psrlw %[t1], %[t0], %[shift0] \n\t" - "and %[t0], %[t0], %[mask] \n\t" - - "paddw %[dest1], %[s0], %[s1] \n\t" - "paddw %[dest1], %[dest1], %[t0] \n\t" - "paddw %[dest1], %[dest1], %[t1] \n\t" - "paddw %[dest1], %[dest1], %[ph] \n\t" - "psrlw %[dest1], %[dest1], %[shift1] \n\t" - - "packsswh %[dest], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[s], %[s], 0x10 \n\t" - "daddiu %[t], %[t], 0x10 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1), - [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), [t_hi] "=&f"(t_hi), - [t_lo] "=&f"(t_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), - [dest] "=&f"(dest) - : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph), - [mask] "f"(mask) - : "memory"); -} - -void ScaleRowDown4_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest_hi, dest_lo; - - const uint64_t shift = 0x10ULL; - const uint64_t mask = 0x000000ff000000ffULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "psrlw %[src0], %[src0], %[shift] \n\t" - "and %[src0], %[src0], %[mask] \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "psrlw %[src1], %[src1], %[shift] \n\t" - "and %[src1], %[src1], %[mask] \n\t" - "packsswh %[dest_lo], %[src0], %[src1] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t" - "psrlw %[src0], %[src0], %[shift] \n\t" - "and %[src0], %[src0], %[mask] \n\t" - "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t" - 
"psrlw %[src1], %[src1], %[shift] \n\t" - "and %[src1], %[src1], %[mask] \n\t" - "packsswh %[dest_hi], %[src0], %[src1] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [shift] "f"(shift), [mask] "f"(mask) - : "memory"); -} - -void ScaleRowDown4_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1; - uint64_t dest, dest_hi, dest_lo; - - const uint64_t mask = 0x0ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "punpckhhw %[dest_lo], %[src0], %[src1] \n\t" - "punpcklhw %[dest_lo], %[dest_lo], %[mask] \n\t" - - "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t" - "punpckhhw %[dest_hi], %[src0], %[src1] \n\t" - "punpcklhw %[dest_hi], %[dest_hi], %[mask] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), - [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [mask] "f"(mask) - : "memory"); -} 
- -#define DO_SCALEROWDOWN4BOX_PUNPCKADD() \ - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \ - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \ - "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \ - "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" - -#define DO_SCALEROWDOWN4BOX_LOOP(reg) \ - "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \ - "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \ - "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \ - \ - "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_PUNPCKADD() \ - \ - "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_PUNPCKADD() \ - \ - "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_PUNPCKADD() \ - \ - "pmaddhw %[dest_lo], %[dest_lo], %[mask1] \n\t" \ - "pmaddhw %[dest_hi], %[dest_hi], %[mask1] \n\t" \ - "packsswh " #reg ", %[dest_lo], %[dest_hi] \n\t" \ - "pmaddhw " #reg ", " #reg ", %[mask1] \n\t" \ - "paddh " #reg ", " #reg ", %[ph] \n\t" \ - "psrlh " #reg ", " #reg ", %[shift] \n\t" \ - \ - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \ - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \ - "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \ - "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t" - -/* LibYUVScaleTest.ScaleDownBy4_Box */ -void ScaleRowDown4Box_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - const uint8_t* src0_ptr = src_ptr; - const uint8_t* src1_ptr = src_ptr + src_stride; - const uint8_t* src2_ptr = src_ptr + src_stride * 2; - const uint8_t* src3_ptr = src_ptr + src_stride * 3; - - uint64_t src, src_hi, src_lo; - uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3; - - const uint64_t mask0 = 0x0ULL; - const uint64_t mask1 = 0x0001000100010001ULL; - const uint64_t ph = 0x0008000800080008ULL; - const uint64_t shift = 0x4ULL; - - __asm__ volatile( - "1: \n\t" - - DO_SCALEROWDOWN4BOX_LOOP(%[dest0]) - DO_SCALEROWDOWN4BOX_LOOP(%[dest1]) - DO_SCALEROWDOWN4BOX_LOOP(%[dest2]) - DO_SCALEROWDOWN4BOX_LOOP(%[dest3]) - - "packsswh %[dest_lo], %[dest0], 
%[dest1] \n\t" - "packsswh %[dest_hi], %[dest2], %[dest3] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest) - : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), - [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst), - [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0), - [ph] "f"(ph), [mask1] "f"(mask1) - : "memory"); -} - -#define DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \ - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \ - "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \ - "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" - -#define DO_SCALEROWDOWN4BOX_16_LOOP(reg) \ - "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \ - "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \ - "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \ - \ - "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ - \ - "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ - \ - "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \ - DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ - \ - "paddw %[dest], %[dest_lo], %[dest_hi] \n\t" \ - "punpckhwd %[dest_hi], %[dest], %[dest] \n\t" \ - "paddw %[dest], %[dest_hi], %[dest] \n\t" \ - "paddw %[dest], %[dest], %[ph] \n\t" \ - "psraw %[dest], %[dest], %[shift] \n\t" \ - "and " #reg ", %[dest], %[mask1] \n\t" \ - \ - "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \ - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \ - "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \ - "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t" - -/* 
LibYUVScaleTest.ScaleDownBy4_Box_16 */ -void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* src0_ptr = src_ptr; - const uint16_t* src1_ptr = src_ptr + src_stride; - const uint16_t* src2_ptr = src_ptr + src_stride * 2; - const uint16_t* src3_ptr = src_ptr + src_stride * 3; - - uint64_t src, src_hi, src_lo; - uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3; - - const uint64_t mask0 = 0x0ULL; - const uint64_t mask1 = 0x00000000ffffffffULL; - const uint64_t ph = 0x0000000800000008ULL; - const uint64_t shift = 0x04ULL; - - __asm__ volatile( - "1: \n\t" - - DO_SCALEROWDOWN4BOX_16_LOOP(%[dest0]) - DO_SCALEROWDOWN4BOX_16_LOOP(%[dest1]) - DO_SCALEROWDOWN4BOX_16_LOOP(%[dest2]) - DO_SCALEROWDOWN4BOX_16_LOOP(%[dest3]) - "punpcklwd %[dest_lo], %[dest0], %[dest1] \n\t" - "punpcklwd %[dest_hi], %[dest2], %[dest3] \n\t" - - "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), - [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest) - : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), - [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst), - [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0), - [ph] "f"(ph), [mask1] "f"(mask1) - : "memory"); -} - -// Scales a single row of pixels up by 2x using point sampling. 
-void ScaleColsUp2_MMI(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx) { - uint64_t src, dest; - - (void)x; - (void)dx; - - __asm__ volatile( - "1: \n\t" - "lwc1 %[src], 0x00(%[src_ptr]) \n\t" - - "punpcklbh %[dest], %[src], %[src] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width) - : "memory"); -} - -void ScaleColsUp2_16_MMI(uint16_t* dst_ptr, - const uint16_t* src_ptr, - int dst_width, - int x, - int dx) { - uint64_t src, dest; - - (void)x; - (void)dx; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - - "punpcklhw %[dest], %[src], %[src] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "punpckhhw %[dest], %[src], %[src] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width) - : "memory"); -} - -void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - uint64_t src, src_hi, src_lo, dest0, dest1; - const uint64_t mask = 0x0ULL; - - __asm__ volatile( - "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask] \n\t" - "punpckhbh %[src_hi], %[src], %[mask] \n\t" - - "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "paddush %[dest0], %[dest0], %[src_lo] \n\t" - 
"gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "paddush %[dest1], %[dest1], %[src_hi] \n\t" - - "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [src] "=&f"(src) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width), - [mask] "f"(mask) - : "memory"); -} - -void ScaleAddRow_16_MMI(const uint16_t* src_ptr, - uint32_t* dst_ptr, - int src_width) { - uint64_t src, src_hi, src_lo, dest0, dest1; - const uint64_t mask = 0x0ULL; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "punpcklhw %[src_lo], %[src], %[mask] \n\t" - "punpckhhw %[src_hi], %[src], %[mask] \n\t" - - "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "paddw %[dest0], %[dest0], %[src_lo] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - - "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "paddw %[dest1], %[dest1], %[src_hi] \n\t" - "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [src] "=&f"(src) - : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width), - [mask] "f"(mask) - : "memory"); -} - -void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - 
int src_stepx, - uint8_t* dst_argb, - int dst_width) { - (void)src_stride; - - uint64_t src0, src1, dest; - - __asm__ volatile( - "1: \n\t" - "lwc1 %[src0], 0x00(%[src_ptr]) \n\t" - "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t" - "lwc1 %[src1], 0x00(%[src_ptr]) \n\t" - "punpcklwd %[dest], %[src0], %[src1] \n\t" - - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), - [src_stepx_4] "r"(src_stepx << 2), [width] "r"(dst_width) - : "memory"); -} - -void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8_t* dst_argb, - int dst_width) { - const uint8_t* src0_ptr = src_argb; - const uint8_t* src1_ptr = src_argb + src_stride; - - uint64_t src0, src1, src_hi, src_lo; - uint64_t dest, dest_hi, dest_lo, dest0, dest1; - - const uint64_t mask = 0x0ULL; - const uint64_t ph = 0x0002000200020002ULL; - const uint64_t shift = 0x2ULL; - - __asm__ volatile( - "1: \n\t" - - "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh %[dest_lo], %[src0], %[mask] \n\t" - "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t" - "punpcklbh %[dest_hi], %[src0], %[mask] \n\t" - - "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src_lo], %[src1], %[mask] \n\t" - "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t" - "punpcklbh %[src_hi], %[src1], %[mask] \n\t" - "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" - "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" - "paddh %[dest0], %[dest_hi], %[dest_lo] \n\t" - "paddh %[dest0], %[dest0], %[ph] \n\t" - "psrlh %[dest0], %[dest0], %[shift] \n\t" - - "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t" - "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t" - - "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t" - "punpcklbh 
%[dest_lo], %[src0], %[mask] \n\t" - "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t" - "punpcklbh %[dest_hi], %[src0], %[mask] \n\t" - - "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t" - "punpcklbh %[src_lo], %[src1], %[mask] \n\t" - "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t" - "punpcklbh %[src_hi], %[src1], %[mask] \n\t" - "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" - "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" - "paddh %[dest1], %[dest_hi], %[dest_lo] \n\t" - "paddh %[dest1], %[dest1], %[ph] \n\t" - "psrlh %[dest1], %[dest1], %[shift] \n\t" - - "packushb %[dest], %[dest0], %[dest1] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t" - "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" - "daddi %[width], %[width], -0x02 \n\t" - "bnez %[width], 1b \n\t" - : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), - [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), - [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0), - [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), - [dst_ptr] "r"(dst_argb), [width] "r"(dst_width), - [src_stepx_4] "r"(src_stepx << 2), [shift] "f"(shift), [mask] "f"(mask), - [ph] "f"(ph) - : "memory"); -} - -// Scales a single row of pixels using point sampling. 
-void ScaleARGBCols_MMI(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - const uint32_t* src = (const uint32_t*)(src_argb); - uint32_t* dst = (uint32_t*)(dst_argb); - - const uint32_t* src_tmp; - - uint64_t dest, offset; - - const uint64_t shift0 = 16; - const uint64_t shift1 = 2; - - __asm__ volatile( - "1: \n\t" - "srav %[offset], %[x], %[shift0] \n\t" - "sllv %[offset], %[offset], %[shift1] \n\t" - "dadd %[src_tmp], %[src_ptr], %[offset] \n\t" - "lwc1 %[dest], 0x00(%[src_tmp]) \n\t" - "swc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "dadd %[x], %[x], %[dx] \n\t" - - "daddiu %[dst_ptr], %[dst_ptr], 0x04 \n\t" - "daddi %[width], %[width], -0x01 \n\t" - "bnez %[width], 1b \n\t" - : [dest] "=&f"(dest), [offset] "=&r"(offset), [src_tmp] "=&r"(src_tmp) - : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width), - [dx] "r"(dx), [x] "r"(x), [shift0] "r"(shift0), [shift1] "r"(shift1) - : "memory"); -} - -// Scales a single row of pixels up by 2x using point sampling. -void ScaleARGBColsUp2_MMI(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - uint64_t src, dest0, dest1; - (void)x; - (void)dx; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" - "punpcklwd %[dest0], %[src], %[src] \n\t" - "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "punpckhwd %[dest1], %[src], %[src] \n\t" - "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src] "=&f"(src) - : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width) - : "memory"); -} - -// Divide num by div and return as 16.16 fixed point result. 
-/* LibYUVBaseTest.TestFixedDiv */ -int FixedDiv_MIPS(int num, int div) { - int quotient = 0; - const int shift = 16; - - asm( - "dsll %[num], %[num], %[shift] \n\t" - "ddiv %[num], %[div] \t\n" - "mflo %[quo] \t\n" - : [quo] "+&r"(quotient) - : [num] "r"(num), [div] "r"(div), [shift] "r"(shift)); - - return quotient; -} - -// Divide num by div and return as 16.16 fixed point result. -/* LibYUVScaleTest.ARGBScaleTo320x240_Linear */ -int FixedDiv1_MIPS(int num, int div) { - int quotient = 0; - const int shift = 16; - const int val1 = 1; - const int64_t val11 = 0x00010001ULL; - - asm( - "dsll %[num], %[num], %[shift] \n\t" - "dsub %[num], %[num], %[val11] \n\t" - "dsub %[div], %[div], %[val1] \n\t" - "ddiv %[num], %[div] \t\n" - "mflo %[quo] \t\n" - : [quo] "+&r"(quotient) - : [num] "r"(num), [div] "r"(div), [val1] "r"(val1), [val11] "r"(val11), - [shift] "r"(shift)); - - return quotient; -} - -// Read 8x2 upsample with filtering and write 16x1. -// actually reads an extra pixel, so 9x2. -void ScaleRowUp2_16_MMI(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst, - int dst_width) { - const uint16_t* src2_ptr = src_ptr + src_stride; - - uint64_t src0, src1; - uint64_t dest, dest04, dest15, dest26, dest37; - uint64_t tmp0, tmp1, tmp2, tmp3; - - const uint64_t mask0 = 0x0003000900030009ULL; - const uint64_t mask1 = 0x0001000300010003ULL; - const uint64_t mask2 = 0x0009000300090003ULL; - const uint64_t mask3 = 0x0003000100030001ULL; - const uint64_t ph = 0x0000000800000008ULL; - const uint64_t shift = 4; - - __asm__ volatile( - "1: \n\t" - "gsldrc1 %[src0], 0x00(%[src1_ptr]) \n\t" - "gsldlc1 %[src0], 0x07(%[src1_ptr]) \n\t" - "pmaddhw %[dest04], %[src0], %[mask0] \n\t" - "gsldrc1 %[src1], 0x00(%[src2_ptr]) \n\t" - "gsldlc1 %[src1], 0x07(%[src2_ptr]) \n\t" - "pmaddhw %[dest], %[src1], %[mask1] \n\t" - "paddw %[dest04], %[dest04], %[dest] \n\t" - "paddw %[dest04], %[dest04], %[ph] \n\t" - "psrlw %[dest04], %[dest04], %[shift] \n\t" - - "pmaddhw %[dest15], 
%[src0], %[mask2] \n\t" - "pmaddhw %[dest], %[src1], %[mask3] \n\t" - "paddw %[dest15], %[dest15], %[dest] \n\t" - "paddw %[dest15], %[dest15], %[ph] \n\t" - "psrlw %[dest15], %[dest15], %[shift] \n\t" - - "gsldrc1 %[src0], 0x02(%[src1_ptr]) \n\t" - "gsldlc1 %[src0], 0x09(%[src1_ptr]) \n\t" - "pmaddhw %[dest26], %[src0], %[mask0] \n\t" - "gsldrc1 %[src1], 0x02(%[src2_ptr]) \n\t" - "gsldlc1 %[src1], 0x09(%[src2_ptr]) \n\t" - "pmaddhw %[dest], %[src1], %[mask1] \n\t" - "paddw %[dest26], %[dest26], %[dest] \n\t" - "paddw %[dest26], %[dest26], %[ph] \n\t" - "psrlw %[dest26], %[dest26], %[shift] \n\t" - - "pmaddhw %[dest37], %[src0], %[mask2] \n\t" - "pmaddhw %[dest], %[src1], %[mask3] \n\t" - "paddw %[dest37], %[dest37], %[dest] \n\t" - "paddw %[dest37], %[dest37], %[ph] \n\t" - "psrlw %[dest37], %[dest37], %[shift] \n\t" - - /* tmp0 = ( 00 04 02 06 ) */ - "packsswh %[tmp0], %[dest04], %[dest26] \n\t" - /* tmp1 = ( 01 05 03 07 ) */ - "packsswh %[tmp1], %[dest15], %[dest37] \n\t" - - /* tmp2 = ( 00 01 04 05 )*/ - "punpcklhw %[tmp2], %[tmp0], %[tmp1] \n\t" - /* tmp3 = ( 02 03 06 07 )*/ - "punpckhhw %[tmp3], %[tmp0], %[tmp1] \n\t" - - /* ( 00 01 02 03 ) */ - "punpcklwd %[dest], %[tmp2], %[tmp3] \n\t" - "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - /* ( 04 05 06 07 ) */ - "punpckhwd %[dest], %[tmp2], %[tmp3] \n\t" - "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" - "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x08 \n\t" - "bnez %[width], 1b \n\t" - : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest04] "=&f"(dest04), - [dest15] "=&f"(dest15), [dest26] "=&f"(dest26), [dest37] "=&f"(dest37), - [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), - [tmp3] "=&f"(tmp3), [dest] "=&f"(dest) - : [src1_ptr] "r"(src_ptr), [src2_ptr] "r"(src2_ptr), [dst_ptr] "r"(dst), - [width] 
"r"(dst_width), [mask0] "f"(mask0), [mask1] "f"(mask1), - [mask2] "f"(mask2), [mask3] "f"(mask3), [shift] "f"(shift), [ph] "f"(ph) - : "memory"); -} - -void ScaleRowDown34_MMI(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst, - int dst_width) { - (void)src_stride; - assert((dst_width % 3 == 0) && (dst_width > 0)); - uint64_t src[2]; - uint64_t tmp[2]; - __asm__ volatile ( - "1: \n\t" - "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" - "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" - "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" - "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" - "and %[tmp1], %[src0], %[mask1] \n\t" - "psrlw %[tmp0], %[src0], %[rmov] \n\t" - "psllw %[tmp0], %[tmp0], %[lmov1] \n\t" - "or %[src0], %[tmp0], %[tmp1] \n\t" - "punpckhwd %[tmp0], %[src0], %[src0] \n\t" - "psllw %[tmp1], %[tmp0], %[rmov] \n\t" - "or %[src0], %[src0], %[tmp1] \n\t" - "psrlw %[tmp0], %[tmp0], %[rmov8] \n\t" - "pextrh %[tmp0], %[tmp0], %[zero] \n\t" - "pinsrh_2 %[src0], %[src0], %[tmp0] \n\t" - "pextrh %[tmp0], %[src1], %[zero] \n\t" - "pinsrh_3 %[src0], %[src0], %[tmp0] \n\t" - - "punpckhwd %[tmp0], %[src1], %[src1] \n\t" - "pextrh %[tmp1], %[tmp0], %[zero] \n\t" - "psrlw %[src1], %[src1], %[rmov] \n\t" - "psllw %[tmp1], %[tmp1], %[rmov8] \n\t" - "or %[src1], %[src1], %[tmp1] \n\t" - "and %[tmp0], %[tmp0], %[mask2] \n\t" - "or %[src1], %[src1], %[tmp0] \n\t" - - "gssdlc1 %[src0], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[src0], 0x00(%[dst_ptr]) \n\t" - "gsswlc1 %[src1], 0x0b(%[dst_ptr]) \n\t" - "gsswrc1 %[src1], 0x08(%[dst_ptr]) \n\t" - - "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x0c \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t" - "bnez %[width], 1b \n\t" - - : [src0]"=&f"(src[0]), [src1]"=&f"(src[1]), - [tmp0]"=&f"(tmp[0]), [tmp1]"=&f"(tmp[1]) - : [src_ptr]"r"(src_ptr), [dst_ptr]"r"(dst), - [lmov]"f"(0xc), [rmov]"f"(0x18), - [mask1]"f"(0xffff0000ffff), [rmov8]"f"(0x8), - [zero]"f"(0x0), [mask2]"f"(0xff000000), - [width]"r"(dst_width), [lmov1]"f"(0x10) - 
: "memory" - ); -} -// clang-format on - -#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/files/source/scale_neon.cc b/files/source/scale_neon.cc index 6a0d6e1b..ccc75106 100644 --- a/files/source/scale_neon.cc +++ b/files/source/scale_neon.cc @@ -1428,6 +1428,45 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, #undef LOAD2_DATA32_LANE +void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels. + "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst1.16 {q1}, [%1]! \n" // store 8 UV + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1"); +} + +void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels. + "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV + "subs %2, %2, #8 \n" // 8 processed per loop. + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vst1.16 {q0}, [%1]! 
\n" // store 8 UV + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1"); +} + void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, diff --git a/files/source/scale_neon64.cc b/files/source/scale_neon64.cc index 9f9636e6..ad06ee83 100644 --- a/files/source/scale_neon64.cc +++ b/files/source/scale_neon64.cc @@ -1568,6 +1568,45 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, ); } +void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v1.8h}, [%1], #16 \n" // store 8 UV + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1"); +} + +void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "st1 {v0.8h}, [%1], #16 \n" // store 8 UV + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1"); +} + void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, diff --git a/files/source/scale_uv.cc b/files/source/scale_uv.cc index 3b3d7b8e..1556071d 100644 --- a/files/source/scale_uv.cc +++ b/files/source/scale_uv.cc @@ -83,9 +83,9 @@ static void ScaleUVDown2(int src_width, assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. // Advance to odd row, even column. 
if (filtering == kFilterBilinear) { - src_uv += (y >> 16) * (int64_t)src_stride + (x >> 16) * 2; + src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2; } else { - src_uv += (y >> 16) * (int64_t)src_stride + ((x >> 16) - 1) * 2; + src_uv += (y >> 16) * (intptr_t)src_stride + ((x >> 16) - 1) * 2; } #if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) @@ -112,6 +112,22 @@ static void ScaleUVDown2(int src_width, } } #endif +#if defined(HAS_SCALEUVROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_Any_NEON + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON + : ScaleUVRowDown2Box_Any_NEON); + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_NEON + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON + : ScaleUVRowDown2Box_NEON); + } + } +#endif // This code is not enabled. Only box filter is available at this time. #if defined(HAS_SCALEUVROWDOWN2_SSSE3) @@ -130,23 +146,7 @@ static void ScaleUVDown2(int src_width, } } #endif -// This code is not enabled. Only box filter is available at this time. -#if defined(HAS_SCALEUVROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_Any_NEON - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON - : ScaleUVRowDown2Box_Any_NEON); - if (IS_ALIGNED(dst_width, 8)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_NEON - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON - : ScaleUVRowDown2Box_NEON); - } - } -#endif + #if defined(HAS_SCALEUVROWDOWN2_MSA) if (TestCpuFlag(kCpuHasMSA)) { ScaleUVRowDown2 = @@ -193,14 +193,14 @@ static void ScaleUVDown4Box(int src_width, int dy) { int j; // Allocate 2 rows of UV. 
- const int kRowSize = (dst_width * 2 * 2 + 15) & ~15; - align_buffer_64(row, kRowSize * 2); + const int row_size = (dst_width * 2 * 2 + 15) & ~15; + align_buffer_64(row, row_size * 2); int row_stride = src_stride * (dy >> 16); void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width) = ScaleUVRowDown2Box_C; // Advance to odd row, even column. - src_uv += (y >> 16) * (int64_t)src_stride + (x >> 16) * 2; + src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2; (void)src_width; (void)src_height; (void)dx; @@ -234,9 +234,9 @@ static void ScaleUVDown4Box(int src_width, for (j = 0; j < dst_height; ++j) { ScaleUVRowDown2(src_uv, src_stride, row, dst_width * 2); - ScaleUVRowDown2(src_uv + src_stride * 2, src_stride, row + kRowSize, + ScaleUVRowDown2(src_uv + src_stride * 2, src_stride, row + row_size, dst_width * 2); - ScaleUVRowDown2(row, kRowSize, dst_uv, dst_width); + ScaleUVRowDown2(row, row_size, dst_uv, dst_width); src_uv += row_stride; dst_uv += dst_stride; } @@ -263,7 +263,7 @@ static void ScaleUVDownEven(int src_width, enum FilterMode filtering) { int j; int col_step = dx >> 16; - int row_stride = (dy >> 16) * (int64_t)src_stride; + ptrdiff_t row_stride = (ptrdiff_t)((dy >> 16) * (intptr_t)src_stride); void (*ScaleUVRowDownEven)(const uint8_t* src_uv, ptrdiff_t src_stride, int src_step, uint8_t* dst_uv, int dst_width) = filtering ? ScaleUVRowDownEvenBox_C : ScaleUVRowDownEven_C; @@ -271,7 +271,7 @@ static void ScaleUVDownEven(int src_width, (void)src_height; assert(IS_ALIGNED(src_width, 2)); assert(IS_ALIGNED(src_height, 2)); - src_uv += (y >> 16) * (int64_t)src_stride + (x >> 16) * 2; + src_uv += (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2; #if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ScaleUVRowDownEven = filtering ? 
ScaleUVRowDownEvenBox_Any_SSSE3 @@ -338,10 +338,10 @@ static void ScaleUVBilinearDown(int src_width, int dy, enum FilterMode filtering) { int j; - void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv, + void (*InterpolateRow)(uint8_t* dst_uv, const uint8_t* src_uv, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv, + void (*ScaleUVFilterCols)(uint8_t* dst_uv, const uint8_t* src_uv, int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleUVFilterCols64_C : ScaleUVFilterCols_C; int64_t xlast = x + (int64_t)(dst_width - 1) * dx; @@ -397,6 +397,11 @@ static void ScaleUVBilinearDown(int src_width, } } #endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } +#endif #if defined(HAS_SCALEUVFILTERCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { ScaleUVFilterCols = ScaleUVFilterCols_SSSE3; @@ -429,7 +434,7 @@ static void ScaleUVBilinearDown(int src_width, } for (j = 0; j < dst_height; ++j) { int yi = y >> 16; - const uint8_t* src = src_uv + yi * (int64_t)src_stride; + const uint8_t* src = src_uv + yi * (intptr_t)src_stride; if (filtering == kFilterLinear) { ScaleUVFilterCols(dst_uv, src, dst_width, x, dx); } else { @@ -464,10 +469,10 @@ static void ScaleUVBilinearUp(int src_width, int dy, enum FilterMode filtering) { int j; - void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv, + void (*InterpolateRow)(uint8_t* dst_uv, const uint8_t* src_uv, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv, + void (*ScaleUVFilterCols)(uint8_t* dst_uv, const uint8_t* src_uv, int dst_width, int x, int dx) = filtering ? 
ScaleUVFilterCols_C : ScaleUVCols_C; const int max_y = (src_height - 1) << 16; @@ -510,6 +515,11 @@ static void ScaleUVBilinearUp(int src_width, InterpolateRow = InterpolateRow_LSX; } } +#endif +#if defined(HAS_INTERPOLATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + InterpolateRow = InterpolateRow_RVV; + } #endif if (src_width >= 32768) { ScaleUVFilterCols = filtering ? ScaleUVFilterCols64_C : ScaleUVCols64_C; @@ -571,14 +581,14 @@ static void ScaleUVBilinearUp(int src_width, { int yi = y >> 16; - const uint8_t* src = src_uv + yi * (int64_t)src_stride; + const uint8_t* src = src_uv + yi * (intptr_t)src_stride; // Allocate 2 rows of UV. - const int kRowSize = (dst_width * 2 + 15) & ~15; - align_buffer_64(row, kRowSize * 2); + const int row_size = (dst_width * 2 + 15) & ~15; + align_buffer_64(row, row_size * 2); uint8_t* rowptr = row; - int rowstride = kRowSize; + int rowstride = row_size; int lasty = yi; ScaleUVFilterCols(rowptr, src, dst_width, x, dx); @@ -596,7 +606,7 @@ static void ScaleUVBilinearUp(int src_width, if (y > max_y) { y = max_y; yi = y >> 16; - src = src_uv + yi * (int64_t)src_stride; + src = src_uv + yi * (intptr_t)src_stride; } if (yi != lasty) { ScaleUVFilterCols(rowptr, src, dst_width, x, dx); @@ -644,32 +654,32 @@ void ScaleUVLinearUp2(int src_width, // This function can only scale up by 2 times horizontally. 
assert(src_width == ((dst_width + 1) / 2)); -#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3 +#ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3 if (TestCpuFlag(kCpuHasSSSE3)) { ScaleRowUp = ScaleUVRowUp2_Linear_Any_SSSE3; } #endif -#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2 +#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { ScaleRowUp = ScaleUVRowUp2_Linear_Any_AVX2; } #endif -#ifdef HAS_SCALEUVROWUP2LINEAR_NEON +#ifdef HAS_SCALEUVROWUP2_LINEAR_NEON if (TestCpuFlag(kCpuHasNEON)) { ScaleRowUp = ScaleUVRowUp2_Linear_Any_NEON; } #endif if (dst_height == 1) { - ScaleRowUp(src_uv + ((src_height - 1) / 2) * (int64_t)src_stride, dst_uv, + ScaleRowUp(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride, dst_uv, dst_width); } else { dy = FixedDiv(src_height - 1, dst_height - 1); y = (1 << 15) - 1; for (i = 0; i < dst_height; ++i) { - ScaleRowUp(src_uv + (y >> 16) * (int64_t)src_stride, dst_uv, dst_width); + ScaleRowUp(src_uv + (y >> 16) * (intptr_t)src_stride, dst_uv, dst_width); dst_uv += dst_stride; y += dy; } @@ -697,19 +707,19 @@ void ScaleUVBilinearUp2(int src_width, assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); -#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3 +#ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3 if (TestCpuFlag(kCpuHasSSSE3)) { Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_SSSE3; } #endif -#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2 +#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_AVX2; } #endif -#ifdef HAS_SCALEUVROWUP2BILINEAR_NEON +#ifdef HAS_SCALEUVROWUP2_BILINEAR_NEON if (TestCpuFlag(kCpuHasNEON)) { Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_NEON; } @@ -751,32 +761,32 @@ void ScaleUVLinearUp2_16(int src_width, // This function can only scale up by 2 times horizontally. 
assert(src_width == ((dst_width + 1) / 2)); -#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE41 +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41 if (TestCpuFlag(kCpuHasSSE41)) { ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41; } #endif -#ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2 +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2; } #endif -#ifdef HAS_SCALEUVROWUP2LINEAR_16_NEON +#ifdef HAS_SCALEUVROWUP2_LINEAR_16_NEON if (TestCpuFlag(kCpuHasNEON)) { ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON; } #endif if (dst_height == 1) { - ScaleRowUp(src_uv + ((src_height - 1) / 2) * (int64_t)src_stride, dst_uv, + ScaleRowUp(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride, dst_uv, dst_width); } else { dy = FixedDiv(src_height - 1, dst_height - 1); y = (1 << 15) - 1; for (i = 0; i < dst_height; ++i) { - ScaleRowUp(src_uv + (y >> 16) * (int64_t)src_stride, dst_uv, dst_width); + ScaleRowUp(src_uv + (y >> 16) * (intptr_t)src_stride, dst_uv, dst_width); dst_uv += dst_stride; y += dy; } @@ -804,19 +814,19 @@ void ScaleUVBilinearUp2_16(int src_width, assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE41 +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41 if (TestCpuFlag(kCpuHasSSE41)) { Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41; } #endif -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2 +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2; } #endif -#ifdef HAS_SCALEUVROWUP2BILINEAR_16_NEON +#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_NEON if (TestCpuFlag(kCpuHasNEON)) { Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON; } @@ -854,7 +864,7 @@ static void ScaleUVSimple(int src_width, int y, int dy) { int j; - void (*ScaleUVCols)(uint8_t * dst_uv, const uint8_t* src_uv, int dst_width, + void (*ScaleUVCols)(uint8_t* dst_uv, const uint8_t* src_uv, int dst_width, int x, int dx) = (src_width >= 32768) ? 
ScaleUVCols64_C : ScaleUVCols_C; (void)src_height; @@ -889,7 +899,7 @@ static void ScaleUVSimple(int src_width, } for (j = 0; j < dst_height; ++j) { - ScaleUVCols(dst_uv, src_uv + (y >> 16) * (int64_t)src_stride, dst_width, x, + ScaleUVCols(dst_uv, src_uv + (y >> 16) * (intptr_t)src_stride, dst_width, x, dx); dst_uv += dst_stride; y += dy; @@ -910,7 +920,7 @@ static int UVCopy(const uint8_t* src_uv, // Negative height means invert the image. if (height < 0) { height = -height; - src_uv = src_uv + (height - 1) * (int64_t)src_stride_uv; + src_uv = src_uv + (height - 1) * (intptr_t)src_stride_uv; src_stride_uv = -src_stride_uv; } @@ -930,7 +940,7 @@ static int UVCopy_16(const uint16_t* src_uv, // Negative height means invert the image. if (height < 0) { height = -height; - src_uv = src_uv + (height - 1) * (int64_t)src_stride_uv; + src_uv = src_uv + (height - 1) * (intptr_t)src_stride_uv; src_stride_uv = -src_stride_uv; } @@ -968,7 +978,7 @@ static void ScaleUV(const uint8_t* src, // Negative src_height means invert the image. if (src_height < 0) { src_height = -src_height; - src = src + (src_height - 1) * (int64_t)src_stride; + src = src + (src_height - 1) * (intptr_t)src_stride; src_stride = -src_stride; } ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, @@ -983,7 +993,7 @@ static void ScaleUV(const uint8_t* src, if (clip_y) { int64_t clipf = (int64_t)(clip_y)*dy; y += (clipf & 0xffff); - src += (clipf >> 16) * (int64_t)src_stride; + src += (clipf >> 16) * (intptr_t)src_stride; dst += clip_y * dst_stride; } @@ -1024,7 +1034,7 @@ static void ScaleUV(const uint8_t* src, #ifdef HAS_UVCOPY if (dx == 0x10000 && dy == 0x10000) { // Straight copy. 
- UVCopy(src + (y >> 16) * (int64_t)src_stride + (x >> 16) * 2, + UVCopy(src + (y >> 16) * (intptr_t)src_stride + (x >> 16) * 2, src_stride, dst, dst_stride, clip_width, clip_height); return; } @@ -1039,7 +1049,7 @@ static void ScaleUV(const uint8_t* src, dst_stride, src, dst, x, y, dy, /*bpp=*/2, filtering); return; } - if (filtering && (dst_width + 1) / 2 == src_width) { + if ((filtering == kFilterLinear) && ((dst_width + 1) / 2 == src_width)) { ScaleUVLinearUp2(src_width, src_height, clip_width, clip_height, src_stride, dst_stride, src, dst); return; @@ -1118,7 +1128,7 @@ int UVScale_16(const uint16_t* src_uv, // Negative src_height means invert the image. if (src_height < 0) { src_height = -src_height; - src_uv = src_uv + (src_height - 1) * (int64_t)src_stride_uv; + src_uv = src_uv + (src_height - 1) * (intptr_t)src_stride_uv; src_stride_uv = -src_stride_uv; } src_width = Abs(src_width); @@ -1126,20 +1136,20 @@ int UVScale_16(const uint16_t* src_uv, #ifdef HAS_UVCOPY if (!filtering && src_width == dst_width && (src_height % dst_height == 0)) { if (dst_height == 1) { - UVCopy_16(src_uv + ((src_height - 1) / 2) * (int64_t)src_stride_uv, + UVCopy_16(src_uv + ((src_height - 1) / 2) * (intptr_t)src_stride_uv, src_stride_uv, dst_uv, dst_stride_uv, dst_width, dst_height); } else { dy = src_height / dst_height; - UVCopy_16(src_uv + ((dy - 1) / 2) * (int64_t)src_stride_uv, - dy * (int64_t)src_stride_uv, dst_uv, dst_stride_uv, dst_width, - dst_height); + UVCopy_16(src_uv + ((dy - 1) / 2) * (intptr_t)src_stride_uv, + (int)(dy * (intptr_t)src_stride_uv), dst_uv, dst_stride_uv, + dst_width, dst_height); } return 0; } #endif - if (filtering && (dst_width + 1) / 2 == src_width) { + if ((filtering == kFilterLinear) && ((dst_width + 1) / 2 == src_width)) { ScaleUVLinearUp2_16(src_width, src_height, dst_width, dst_height, src_stride_uv, dst_stride_uv, src_uv, dst_uv); return 0; diff --git a/files/tools_libyuv/autoroller/roll_deps.py b/files/tools_libyuv/autoroller/roll_deps.py 
index 977c86de..2b57eb65 100755 --- a/files/tools_libyuv/autoroller/roll_deps.py +++ b/files/tools_libyuv/autoroller/roll_deps.py @@ -1,18 +1,14 @@ #!/usr/bin/env vpython3 -# Copyright 2017 The LibYuv Project Authors. All rights reserved. +# Copyright (c) 2017 The LibYUV project authors. All Rights Reserved. # # Use of this source code is governed by a BSD-style license # that can be found in the LICENSE file in the root of the source # tree. An additional intellectual property rights grant can be found -# in the file PATENTS. All contributing project authors may +# in the file PATENTS. All contributing project authors may # be found in the AUTHORS file in the root of the source tree. +"""Script to automatically roll dependencies in the LibYUV DEPS file.""" -# This is a modified copy of the script in -# https://webrtc.googlesource.com/src/+/master/tools_webrtc/autoroller/roll_deps.py -# customized for libyuv. - -"""Script to automatically roll dependencies in the libyuv DEPS file.""" import argparse import base64 @@ -25,9 +21,46 @@ import sys import urllib.request +def FindSrcDirPath(): + """Returns the abs path to the src/ dir of the project.""" + src_dir = os.path.dirname(os.path.abspath(__file__)) + while os.path.basename(src_dir) != 'src': + src_dir = os.path.normpath(os.path.join(src_dir, os.pardir)) + return src_dir + + # Skip these dependencies (list without solution name prefix). DONT_AUTOROLL_THESE = [ - 'src/third_party/gflags/src', + 'src/third_party/gflags/src', + 'src/third_party/mockito/src', +] + +# These dependencies are missing in chromium/src/DEPS, either unused or already +# in-tree. For instance, src/base is a part of the Chromium source git repo, +# but we pull it through a subtree mirror, so therefore it isn't listed in +# Chromium's deps but it is in ours. 
+LIBYUV_ONLY_DEPS = [ + 'src/base', + 'src/build', + 'src/buildtools', + 'src/ios', + 'src/testing', + 'src/third_party', + 'src/third_party/android_support_test_runner', + 'src/third_party/bazel', + 'src/third_party/bouncycastle', + 'src/third_party/errorprone/lib', + 'src/third_party/findbugs', + 'src/third_party/gson', + 'src/third_party/gtest-parallel', + 'src/third_party/guava', + 'src/third_party/intellij', + 'src/third_party/jsr-305/src', + 'src/third_party/ow2_asm', + 'src/third_party/proguard', + 'src/third_party/ub-uiautomator/lib', + 'src/tools', + 'src/tools/clang/dsymutil', ] LIBYUV_URL = 'https://chromium.googlesource.com/libyuv/libyuv' @@ -37,16 +70,22 @@ CHROMIUM_LOG_TEMPLATE = CHROMIUM_SRC_URL + '/+log/%s' CHROMIUM_FILE_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s/%s' COMMIT_POSITION_RE = re.compile('^Cr-Commit-Position: .*#([0-9]+).*$') -CLANG_REVISION_RE = re.compile(r'^CLANG_REVISION = \'([0-9a-z-]+)\'$') +CLANG_REVISION_RE = re.compile(r'^CLANG_REVISION = \'([-0-9a-z]+)\'$') ROLL_BRANCH_NAME = 'roll_chromium_revision' SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) -CHECKOUT_SRC_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.pardir, - os.pardir)) +CHECKOUT_SRC_DIR = FindSrcDirPath() CHECKOUT_ROOT_DIR = os.path.realpath(os.path.join(CHECKOUT_SRC_DIR, os.pardir)) +# Copied from tools/android/roll/android_deps/.../BuildConfigGenerator.groovy. +ANDROID_DEPS_START = r'=== ANDROID_DEPS Generated Code Start ===' +ANDROID_DEPS_END = r'=== ANDROID_DEPS Generated Code End ===' +# Location of automically gathered android deps. 
+ANDROID_DEPS_PATH = 'src/third_party/android_deps/' + sys.path.append(os.path.join(CHECKOUT_SRC_DIR, 'build')) -import find_depot_tools # pylint: disable=wrong-import-position +import find_depot_tools + find_depot_tools.add_depot_tools_to_path() CLANG_UPDATE_SCRIPT_URL_PATH = 'tools/clang/scripts/update.py' @@ -56,11 +95,26 @@ CLANG_UPDATE_SCRIPT_LOCAL_PATH = os.path.join(CHECKOUT_SRC_DIR, 'tools', DepsEntry = collections.namedtuple('DepsEntry', 'path url revision') ChangedDep = collections.namedtuple('ChangedDep', 'path url current_rev new_rev') +CipdDepsEntry = collections.namedtuple('CipdDepsEntry', 'path packages') +VersionEntry = collections.namedtuple('VersionEntry', 'version') +ChangedCipdPackage = collections.namedtuple( + 'ChangedCipdPackage', 'path package current_version new_version') +ChangedVersionEntry = collections.namedtuple( + 'ChangedVersionEntry', 'path current_version new_version') + +ChromiumRevisionUpdate = collections.namedtuple('ChromiumRevisionUpdate', + ('current_chromium_rev ' + 'new_chromium_rev ')) + class RollError(Exception): pass +def StrExpansion(): + return lambda str_value: str_value + + def VarLookup(local_scope): return lambda var_name: local_scope['vars'][var_name] @@ -68,9 +122,9 @@ def VarLookup(local_scope): def ParseDepsDict(deps_content): local_scope = {} global_scope = { - 'Var': VarLookup(local_scope), - 'Str': lambda s: s, - 'deps_os': {}, + 'Str': StrExpansion(), + 'Var': VarLookup(local_scope), + 'deps_os': {}, } exec(deps_content, global_scope, local_scope) return local_scope @@ -82,11 +136,6 @@ def ParseLocalDepsFile(filename): return ParseDepsDict(deps_content) -def ParseRemoteCrDepsFile(revision): - deps_content = ReadRemoteCrFile('DEPS', revision) - return ParseDepsDict(deps_content) - - def ParseCommitPosition(commit_message): for line in reversed(commit_message.splitlines()): m = COMMIT_POSITION_RE.match(line.strip()) @@ -97,15 +146,18 @@ def ParseCommitPosition(commit_message): sys.exit(-1) -def 
_RunCommand(command, working_dir=None, ignore_exit_code=False, - extra_env=None, input_data=None): +def _RunCommand(command, + working_dir=None, + ignore_exit_code=False, + extra_env=None, + input_data=None): """Runs a command and returns the output from that command. - If the command fails (exit code != 0), the function will exit the process. + If the command fails (exit code != 0), the function will exit the process. - Returns: - A tuple containing the stdout and stderr outputs as strings. - """ + Returns: + A tuple containing the stdout and stderr outputs as strings. + """ working_dir = working_dir or CHECKOUT_SRC_DIR logging.debug('CMD: %s CWD: %s', ' '.join(command), working_dir) env = os.environ.copy() @@ -134,9 +186,9 @@ def _RunCommand(command, working_dir=None, ignore_exit_code=False, def _GetBranches(): """Returns a tuple of active,branches. - The 'active' is the name of the currently active branch and 'branches' is a - list of all branches. - """ + The 'active' is the name of the currently active branch and 'branches' is a + list of all branches. + """ lines = _RunCommand(['git', 'branch'])[0].split('\n') branches = [] active = '' @@ -160,9 +212,16 @@ def _ReadGitilesContent(url): def ReadRemoteCrFile(path_below_src, revision): - """Reads a remote Chromium file of a specific revision. Returns a string.""" - return _ReadGitilesContent(CHROMIUM_FILE_TEMPLATE % (revision, - path_below_src)) + """Reads a remote Chromium file of a specific revision. + + Args: + path_below_src: A path to the target file relative to src dir. + revision: Revision to read. + Returns: + A string with file content. + """ + return _ReadGitilesContent(CHROMIUM_FILE_TEMPLATE % + (revision, path_below_src)) def ReadRemoteCrCommit(revision): @@ -171,7 +230,13 @@ def ReadRemoteCrCommit(revision): def ReadUrlContent(url): - """Connect to a remote host and read the contents. Returns a list of lines.""" + """Connect to a remote host and read the contents. + + Args: + url: URL to connect to. 
+ Returns: + A list of lines. + """ conn = urllib.request.urlopen(url) try: return conn.readlines() @@ -185,52 +250,172 @@ def ReadUrlContent(url): def GetMatchingDepsEntries(depsentry_dict, dir_path): """Gets all deps entries matching the provided path. - This list may contain more than one DepsEntry object. - Example: dir_path='src/testing' would give results containing both - 'src/testing/gtest' and 'src/testing/gmock' deps entries for Chromium's DEPS. - Example 2: dir_path='src/build' should return 'src/build' but not - 'src/buildtools'. + This list may contain more than one DepsEntry object. + Example: dir_path='src/testing' would give results containing both + 'src/testing/gtest' and 'src/testing/gmock' deps entries for Chromium's + DEPS. + Example 2: dir_path='src/build' should return 'src/build' but not + 'src/buildtools'. - Returns: - A list of DepsEntry objects. - """ + Returns: + A list of DepsEntry objects. + """ result = [] for path, depsentry in depsentry_dict.items(): if path == dir_path: result.append(depsentry) else: parts = path.split('/') - if all(part == parts[i] - for i, part in enumerate(dir_path.split('/'))): + if all(part == parts[i] for i, part in enumerate(dir_path.split('/'))): result.append(depsentry) return result + def BuildDepsentryDict(deps_dict): """Builds a dict of paths to DepsEntry objects from a raw deps dict.""" result = {} def AddDepsEntries(deps_subdict): - for path, deps_url_spec in deps_subdict.items(): - if isinstance(deps_url_spec, dict): - if deps_url_spec.get('dep_type') == 'cipd': - continue - deps_url = deps_url_spec['url'] + for path, dep in deps_subdict.items(): + if path in result: + continue + if not isinstance(dep, dict): + dep = {'url': dep} + if dep.get('dep_type') == 'cipd': + result[path] = CipdDepsEntry(path, dep['packages']) else: - deps_url = deps_url_spec - if not path in result: - url, revision = deps_url.split('@') if deps_url else (None, None) + if '@' not in dep['url']: + continue + url, revision = 
dep['url'].split('@') result[path] = DepsEntry(path, url, revision) + def AddVersionEntry(vars_subdict): + for key, value in vars_subdict.items(): + if key in result: + continue + if not key.endswith('_version'): + continue + key = re.sub('_version$', '', key) + result[key] = VersionEntry(value) + AddDepsEntries(deps_dict['deps']) - for deps_os in ['win', 'mac', 'linux', 'android', 'ios', 'unix']: + for deps_os in ['win', 'mac', 'unix', 'android', 'ios', 'unix']: AddDepsEntries(deps_dict.get('deps_os', {}).get(deps_os, {})) + AddVersionEntry(deps_dict.get('vars', {})) return result +def _FindChangedCipdPackages(path, old_pkgs, new_pkgs): + old_pkgs_names = {p['package'] for p in old_pkgs} + new_pkgs_names = {p['package'] for p in new_pkgs} + pkgs_equal = (old_pkgs_names == new_pkgs_names) + added_pkgs = [p for p in new_pkgs_names if p not in old_pkgs_names] + removed_pkgs = [p for p in old_pkgs_names if p not in new_pkgs_names] + + assert pkgs_equal, ('Old: %s\n New: %s.\nYou need to do a manual roll ' + 'and remove/add entries in DEPS so the old and new ' + 'list match.\nMost likely, you should add \"%s\" and ' + 'remove \"%s\"' % + (old_pkgs, new_pkgs, added_pkgs, removed_pkgs)) + + for old_pkg in old_pkgs: + for new_pkg in new_pkgs: + old_version = old_pkg['version'] + new_version = new_pkg['version'] + if (old_pkg['package'] == new_pkg['package'] + and old_version != new_version): + logging.debug('Roll dependency %s to %s', path, new_version) + yield ChangedCipdPackage(path, old_pkg['package'], old_version, + new_version) + + +def _FindChangedVars(name, old_version, new_version): + if old_version != new_version: + logging.debug('Roll dependency %s to %s', name, new_version) + yield ChangedVersionEntry(name, old_version, new_version) + + +def _FindNewDeps(old, new): + """ Gather dependencies only in `new` and return corresponding paths. 
""" + old_entries = set(BuildDepsentryDict(old)) + new_entries = set(BuildDepsentryDict(new)) + return [ + path for path in new_entries - old_entries + if path not in DONT_AUTOROLL_THESE + ] + + +def FindAddedDeps(libyuv_deps, new_cr_deps): + """ + Calculate new deps entries of interest. + + Ideally, that would mean: only appearing in chromium DEPS + but transitively used in LibYUV. + + Since it's hard to compute, we restrict ourselves to a well defined subset: + deps sitting in `ANDROID_DEPS_PATH`. + Otherwise, assumes that's a Chromium-only dependency. + + Args: + libyuv_deps: dict of deps as defined in the LibYUV DEPS file. + new_cr_deps: dict of deps as defined in the chromium DEPS file. + + Caveat: Doesn't detect a new package in existing dep. + + Returns: + A tuple consisting of: + A list of paths added dependencies sitting in `ANDROID_DEPS_PATH`. + A list of paths for other added dependencies. + """ + all_added_deps = _FindNewDeps(libyuv_deps, new_cr_deps) + generated_android_deps = [ + path for path in all_added_deps if path.startswith(ANDROID_DEPS_PATH) + ] + other_deps = [ + path for path in all_added_deps if path not in generated_android_deps + ] + return generated_android_deps, other_deps + + +def FindRemovedDeps(libyuv_deps, new_cr_deps): + """ + Calculate obsolete deps entries. + + Ideally, that would mean: no more appearing in chromium DEPS + and not used in LibYUV. + + Since it's hard to compute: + 1/ We restrict ourselves to a well defined subset: + deps sitting in `ANDROID_DEPS_PATH`. + 2/ We rely on existing behavior of CalculateChangeDeps. + I.e. Assumes non-CIPD dependencies are LibYUV-only, don't remove them. + + Args: + libyuv_deps: dict of deps as defined in the LibYUV DEPS file. + new_cr_deps: dict of deps as defined in the chromium DEPS file. + + Caveat: Doesn't detect a deleted package in existing dep. + + Returns: + A tuple consisting of: + A list of paths of dependencies removed from `ANDROID_DEPS_PATH`. 
+ A list of paths of unexpected disappearing dependencies. + """ + all_removed_deps = _FindNewDeps(new_cr_deps, libyuv_deps) + generated_android_deps = sorted( + [path for path in all_removed_deps if path.startswith(ANDROID_DEPS_PATH)]) + # Webrtc-only dependencies are handled in CalculateChangedDeps. + other_deps = sorted([ + path for path in all_removed_deps + if path not in generated_android_deps and path not in LIBYUV_ONLY_DEPS + ]) + return generated_android_deps, other_deps + + def CalculateChangedDeps(libyuv_deps, new_cr_deps): """ - Calculate changed deps entries based on entries defined in the libyuv DEPS - file: + Calculate changed deps entries based on entries defined in the LibYUV DEPS + file: - If a shared dependency with the Chromium DEPS file: roll it to the same revision as Chromium (i.e. entry in the new_cr_deps dict) - If it's a Chromium sub-directory, roll it to the HEAD revision (notice @@ -239,9 +424,9 @@ def CalculateChangedDeps(libyuv_deps, new_cr_deps): - If it's another DEPS entry (not shared with Chromium), roll it to HEAD unless it's configured to be skipped. - Returns: - A list of ChangedDep objects representing the changed deps. - """ + Returns: + A list of ChangedDep objects representing the changed deps. + """ result = [] libyuv_entries = BuildDepsentryDict(libyuv_deps) new_cr_entries = BuildDepsentryDict(new_cr_deps) @@ -250,68 +435,117 @@ def CalculateChangedDeps(libyuv_deps, new_cr_deps): continue cr_deps_entry = new_cr_entries.get(path) if cr_deps_entry: + assert type(cr_deps_entry) is type(libyuv_deps_entry) + + if isinstance(cr_deps_entry, CipdDepsEntry): + result.extend( + _FindChangedCipdPackages(path, libyuv_deps_entry.packages, + cr_deps_entry.packages)) + continue + + if isinstance(cr_deps_entry, VersionEntry): + result.extend( + _FindChangedVars(path, libyuv_deps_entry.version, + cr_deps_entry.version)) + continue + # Use the revision from Chromium's DEPS file. 
new_rev = cr_deps_entry.revision assert libyuv_deps_entry.url == cr_deps_entry.url, ( - 'Libyuv DEPS entry %s has a different URL (%s) than Chromium (%s).' % - (path, libyuv_deps_entry.url, cr_deps_entry.url)) + 'LibYUV DEPS entry %s has a different URL %s than Chromium %s.' % + (path, libyuv_deps_entry.url, cr_deps_entry.url)) else: - # Use the HEAD of the deps repo. - stdout, _ = _RunCommand(['git', 'ls-remote', libyuv_deps_entry.url, - 'HEAD']) - new_rev = stdout.strip().split('\t')[0] + if isinstance(libyuv_deps_entry, DepsEntry): + # Use the HEAD of the deps repo. + stdout, _ = _RunCommand( + ['git', 'ls-remote', libyuv_deps_entry.url, 'HEAD']) + new_rev = stdout.strip().split('\t')[0] + else: + # The dependency has been removed from chromium. + # This is handled by FindRemovedDeps. + continue # Check if an update is necessary. if libyuv_deps_entry.revision != new_rev: logging.debug('Roll dependency %s to %s', path, new_rev) - result.append(ChangedDep(path, libyuv_deps_entry.url, - libyuv_deps_entry.revision, new_rev)) + result.append( + ChangedDep(path, libyuv_deps_entry.url, libyuv_deps_entry.revision, + new_rev)) return sorted(result) def CalculateChangedClang(new_cr_rev): + def GetClangRev(lines): for line in lines: match = CLANG_REVISION_RE.match(line) if match: return match.group(1) - raise RollError('Could not parse Clang revision from:\n' + '\n'.join(' ' + l for l in lines)) + raise RollError('Could not parse Clang revision!') with open(CLANG_UPDATE_SCRIPT_LOCAL_PATH, 'r') as f: current_lines = f.readlines() current_rev = GetClangRev(current_lines) new_clang_update_py = ReadRemoteCrFile(CLANG_UPDATE_SCRIPT_URL_PATH, - new_cr_rev).splitlines() + new_cr_rev).splitlines() new_rev = GetClangRev(new_clang_update_py) return ChangedDep(CLANG_UPDATE_SCRIPT_LOCAL_PATH, None, current_rev, new_rev) -def GenerateCommitMessage(current_cr_rev, new_cr_rev, current_commit_pos, - new_commit_pos, changed_deps_list, clang_change): - current_cr_rev = current_cr_rev[0:10] 
- new_cr_rev = new_cr_rev[0:10] +def GenerateCommitMessage( + rev_update, + current_commit_pos, + new_commit_pos, + changed_deps_list, + added_deps_paths=None, + removed_deps_paths=None, + clang_change=None, +): + current_cr_rev = rev_update.current_chromium_rev[0:10] + new_cr_rev = rev_update.new_chromium_rev[0:10] rev_interval = '%s..%s' % (current_cr_rev, new_cr_rev) git_number_interval = '%s:%s' % (current_commit_pos, new_commit_pos) - commit_msg = ['Roll chromium_revision %s (%s)\n' % (rev_interval, - git_number_interval)] - commit_msg.append('Change log: %s' % (CHROMIUM_LOG_TEMPLATE % rev_interval)) - commit_msg.append('Full diff: %s\n' % (CHROMIUM_COMMIT_TEMPLATE % - rev_interval)) + commit_msg = [ + 'Roll chromium_revision %s (%s)\n' % (rev_interval, git_number_interval), + 'Change log: %s' % (CHROMIUM_LOG_TEMPLATE % rev_interval), + 'Full diff: %s\n' % (CHROMIUM_COMMIT_TEMPLATE % rev_interval) + ] + + def Section(adjective, deps): + noun = 'dependency' if len(deps) == 1 else 'dependencies' + commit_msg.append('%s %s' % (adjective, noun)) + if changed_deps_list: - commit_msg.append('Changed dependencies:') + Section('Changed', changed_deps_list) for c in changed_deps_list: - commit_msg.append('* %s: %s/+log/%s..%s' % (c.path, c.url, - c.current_rev[0:10], - c.new_rev[0:10])) + if isinstance(c, ChangedCipdPackage): + commit_msg.append('* %s: %s..%s' % + (c.path, c.current_version, c.new_version)) + elif isinstance(c, ChangedVersionEntry): + commit_msg.append('* %s_vesion: %s..%s' % + (c.path, c.current_version, c.new_version)) + else: + commit_msg.append('* %s: %s/+log/%s..%s' % + (c.path, c.url, c.current_rev[0:10], c.new_rev[0:10])) + + if added_deps_paths: + Section('Added', added_deps_paths) + commit_msg.extend('* %s' % p for p in added_deps_paths) + + if removed_deps_paths: + Section('Removed', removed_deps_paths) + commit_msg.extend('* %s' % p for p in removed_deps_paths) + + if any([changed_deps_list, added_deps_paths, removed_deps_paths]): change_url 
= CHROMIUM_FILE_TEMPLATE % (rev_interval, 'DEPS') commit_msg.append('DEPS diff: %s\n' % change_url) else: commit_msg.append('No dependencies changed.') - if clang_change.current_rev != clang_change.new_rev: + if clang_change and clang_change.current_rev != clang_change.new_rev: commit_msg.append('Clang version changed %s:%s' % (clang_change.current_rev, clang_change.new_rev)) change_url = CHROMIUM_FILE_TEMPLATE % (rev_interval, @@ -320,38 +554,61 @@ def GenerateCommitMessage(current_cr_rev, new_cr_rev, current_commit_pos, else: commit_msg.append('No update to Clang.\n') - # TBR needs to be non-empty for Gerrit to process it. - git_author = _RunCommand(['git', 'config', 'user.email'], - working_dir=CHECKOUT_SRC_DIR)[0].strip() - commit_msg.append('TBR=%s' % git_author) - commit_msg.append('BUG=None') return '\n'.join(commit_msg) -def UpdateDepsFile(deps_filename, old_cr_revision, new_cr_revision, - changed_deps): +def UpdateDepsFile(deps_filename, rev_update, changed_deps, new_cr_content): """Update the DEPS file with the new revision.""" - # Update the chromium_revision variable. with open(deps_filename, 'rb') as deps_file: deps_content = deps_file.read().decode('utf-8') - deps_content = deps_content.replace(old_cr_revision, new_cr_revision) + + # Update the chromium_revision variable. + deps_content = deps_content.replace(rev_update.current_chromium_rev, + rev_update.new_chromium_rev) + + # Add and remove dependencies. For now: only generated android deps. + # Since gclient cannot add or remove deps, we on the fact that + # these android deps are located in one place we can copy/paste. 
+ deps_re = re.compile(ANDROID_DEPS_START + '.*' + ANDROID_DEPS_END, re.DOTALL) + new_deps = deps_re.search(new_cr_content) + old_deps = deps_re.search(deps_content) + if not new_deps or not old_deps: + faulty = 'Chromium' if not new_deps else 'LibYUV' + raise RollError('Was expecting to find "%s" and "%s"\n' + 'in %s DEPS' % + (ANDROID_DEPS_START, ANDROID_DEPS_END, faulty)) + deps_content = deps_re.sub(new_deps.group(0), deps_content) + + for dep in changed_deps: + if isinstance(dep, ChangedVersionEntry): + deps_content = deps_content.replace(dep.current_version, dep.new_version) + with open(deps_filename, 'wb') as deps_file: deps_file.write(deps_content.encode('utf-8')) # Update each individual DEPS entry. for dep in changed_deps: + # ChangedVersionEntry types are already been processed. + if isinstance(dep, ChangedVersionEntry): + continue local_dep_dir = os.path.join(CHECKOUT_ROOT_DIR, dep.path) if not os.path.isdir(local_dep_dir): raise RollError( - 'Cannot find local directory %s. Make sure the .gclient file\n' - 'contains all platforms in the target_os list, i.e.\n' + 'Cannot find local directory %s. Either run\n' + 'gclient sync --deps=all\n' + 'or make sure the .gclient file for your solution contains all ' + 'platforms in the target_os list, i.e.\n' 'target_os = ["android", "unix", "mac", "ios", "win"];\n' 'Then run "gclient sync" again.' 
% local_dep_dir) - _RunCommand( - ['gclient', 'setdep', '--revision', '%s@%s' % (dep.path, dep.new_rev)], - working_dir=CHECKOUT_SRC_DIR) + if isinstance(dep, ChangedCipdPackage): + package = dep.package.format() # Eliminate double curly brackets + update = '%s:%s@%s' % (dep.path, package, dep.new_version) + else: + update = '%s@%s' % (dep.path, dep.new_rev) + _RunCommand(['gclient', 'setdep', '--revision', update], + working_dir=CHECKOUT_SRC_DIR) def _IsTreeClean(): @@ -363,9 +620,9 @@ def _IsTreeClean(): return False -def _EnsureUpdatedMasterBranch(dry_run): - current_branch = _RunCommand( - ['git', 'rev-parse', '--abbrev-ref', 'HEAD'])[0].splitlines()[0] +def _EnsureUpdatedMainBranch(dry_run): + current_branch = _RunCommand(['git', 'rev-parse', '--abbrev-ref', + 'HEAD'])[0].splitlines()[0] if current_branch != 'main': logging.error('Please checkout the main branch and re-run this script.') if not dry_run: @@ -407,19 +664,34 @@ def ChooseCQMode(skip_cq, cq_over, current_commit_pos, new_commit_pos): return 2 -def _UploadCL(commit_queue_mode): +def _GetCcRecipients(changed_deps_list): + """Returns a list of emails to notify based on the changed deps list. + """ + cc_recipients = [] + for c in changed_deps_list: + pass + return cc_recipients + + +def _UploadCL(commit_queue_mode, add_cc=None): """Upload the committed changes as a changelist to Gerrit. - commit_queue_mode: - - 2: Submit to commit queue. - - 1: Run trybots but do not submit to CQ. - - 0: Skip CQ, upload only. - """ - cmd = ['git', 'cl', 'upload', '--force', '--bypass-hooks', '--send-mail'] + commit_queue_mode: + - 2: Submit to commit queue. + - 1: Run trybots but do not submit to CQ. + - 0: Skip CQ, upload only. + + add_cc: A list of email addresses to add as CC recipients. 
+ """ + cc_recipients = [] + if add_cc: + cc_recipients.extend(add_cc) + cmd = ['git', 'cl', 'upload', '--force', '--bypass-hooks'] if commit_queue_mode >= 2: logging.info('Sending the CL to the CQ...') cmd.extend(['-o', 'label=Bot-Commit+1']) cmd.extend(['-o', 'label=Commit-Queue+2']) + cmd.extend(['--send-mail', '--cc', ','.join(cc_recipients)]) elif commit_queue_mode >= 1: logging.info('Starting CQ dry run...') cmd.extend(['-o', 'label=Commit-Queue+1']) @@ -429,31 +701,57 @@ def _UploadCL(commit_queue_mode): } stdout, stderr = _RunCommand(cmd, extra_env=extra_env) logging.debug('Output from "git cl upload":\nstdout:\n%s\n\nstderr:\n%s', - stdout, stderr) + stdout, stderr) + + +def GetRollRevisionRanges(opts, libyuv_deps): + current_cr_rev = libyuv_deps['vars']['chromium_revision'] + new_cr_rev = opts.revision + if not new_cr_rev: + stdout, _ = _RunCommand(['git', 'ls-remote', CHROMIUM_SRC_URL, 'HEAD']) + head_rev = stdout.strip().split('\t')[0] + logging.info('No revision specified. Using HEAD: %s', head_rev) + new_cr_rev = head_rev + + return ChromiumRevisionUpdate(current_cr_rev, new_cr_rev) def main(): p = argparse.ArgumentParser() - p.add_argument('--clean', action='store_true', default=False, + p.add_argument('--clean', + action='store_true', + default=False, help='Removes any previous local roll branch.') - p.add_argument('-r', '--revision', + p.add_argument('-r', + '--revision', help=('Chromium Git revision to roll to. 
Defaults to the ' 'Chromium HEAD revision if omitted.')) - p.add_argument('--dry-run', action='store_true', default=False, + p.add_argument('--dry-run', + action='store_true', + default=False, help=('Calculate changes and modify DEPS, but don\'t create ' 'any local branch, commit, upload CL or send any ' 'tryjobs.')) - p.add_argument('-i', '--ignore-unclean-workdir', action='store_true', + p.add_argument('-i', + '--ignore-unclean-workdir', + action='store_true', default=False, help=('Ignore if the current branch is not main or if there ' 'are uncommitted changes (default: %(default)s).')) grp = p.add_mutually_exclusive_group() - grp.add_argument('--skip-cq', action='store_true', default=False, + grp.add_argument('--skip-cq', + action='store_true', + default=False, help='Skip sending the CL to the CQ (default: %(default)s)') - grp.add_argument('--cq-over', type=int, default=1, + grp.add_argument('--cq-over', + type=int, + default=1, help=('Commit queue dry run if the revision difference ' 'is below this number (default: %(default)s)')) - p.add_argument('-v', '--verbose', action='store_true', default=False, + p.add_argument('-v', + '--verbose', + action='store_true', + default=False, help='Be extra verbose in printing of log messages.') opts = p.parse_args() @@ -470,38 +768,52 @@ def main(): _RemovePreviousRollBranch(opts.dry_run) if not opts.ignore_unclean_workdir: - _EnsureUpdatedMasterBranch(opts.dry_run) - - new_cr_rev = opts.revision - if not new_cr_rev: - stdout, _ = _RunCommand(['git', 'ls-remote', CHROMIUM_SRC_URL, 'HEAD']) - head_rev = stdout.strip().split('\t')[0] - logging.info('No revision specified. 
Using HEAD: %s', head_rev) - new_cr_rev = head_rev + _EnsureUpdatedMainBranch(opts.dry_run) deps_filename = os.path.join(CHECKOUT_SRC_DIR, 'DEPS') libyuv_deps = ParseLocalDepsFile(deps_filename) - current_cr_rev = libyuv_deps['vars']['chromium_revision'] - current_commit_pos = ParseCommitPosition(ReadRemoteCrCommit(current_cr_rev)) - new_commit_pos = ParseCommitPosition(ReadRemoteCrCommit(new_cr_rev)) + rev_update = GetRollRevisionRanges(opts, libyuv_deps) - new_cr_deps = ParseRemoteCrDepsFile(new_cr_rev) + current_commit_pos = ParseCommitPosition( + ReadRemoteCrCommit(rev_update.current_chromium_rev)) + new_commit_pos = ParseCommitPosition( + ReadRemoteCrCommit(rev_update.new_chromium_rev)) + + new_cr_content = ReadRemoteCrFile('DEPS', rev_update.new_chromium_rev) + new_cr_deps = ParseDepsDict(new_cr_content) changed_deps = CalculateChangedDeps(libyuv_deps, new_cr_deps) - clang_change = CalculateChangedClang(new_cr_rev) - commit_msg = GenerateCommitMessage(current_cr_rev, new_cr_rev, - current_commit_pos, new_commit_pos, - changed_deps, clang_change) + # Discard other deps, assumed to be chromium-only dependencies. + new_generated_android_deps, _ = FindAddedDeps(libyuv_deps, new_cr_deps) + removed_generated_android_deps, other_deps = FindRemovedDeps( + libyuv_deps, new_cr_deps) + if other_deps: + raise RollError('LibYUV DEPS entries are missing from Chromium: %s.\n' + 'Remove them or add them to either ' + 'LIBYUV_ONLY_DEPS or DONT_AUTOROLL_THESE.' 
% other_deps) + clang_change = CalculateChangedClang(rev_update.new_chromium_rev) + commit_msg = GenerateCommitMessage( + rev_update, + current_commit_pos, + new_commit_pos, + changed_deps, + added_deps_paths=new_generated_android_deps, + removed_deps_paths=removed_generated_android_deps, + clang_change=clang_change) logging.debug('Commit message:\n%s', commit_msg) _CreateRollBranch(opts.dry_run) - UpdateDepsFile(deps_filename, current_cr_rev, new_cr_rev, changed_deps) - _LocalCommit(commit_msg, opts.dry_run) - commit_queue_mode = ChooseCQMode(opts.skip_cq, opts.cq_over, - current_commit_pos, new_commit_pos) - logging.info('Uploading CL...') if not opts.dry_run: - _UploadCL(commit_queue_mode) + UpdateDepsFile(deps_filename, rev_update, changed_deps, new_cr_content) + if _IsTreeClean(): + logging.info("No DEPS changes detected, skipping CL creation.") + else: + _LocalCommit(commit_msg, opts.dry_run) + commit_queue_mode = ChooseCQMode(opts.skip_cq, opts.cq_over, + current_commit_pos, new_commit_pos) + logging.info('Uploading CL...') + if not opts.dry_run: + _UploadCL(commit_queue_mode, _GetCcRecipients(changed_deps)) return 0 diff --git a/files/unit_test/convert_test.cc b/files/unit_test/convert_test.cc index 1f975825..1f1896b0 100644 --- a/files/unit_test/convert_test.cc +++ b/files/unit_test/convert_test.cc @@ -48,6 +48,7 @@ namespace libyuv { #define AR30ToAR30 ARGBCopy #define ABGRToABGR ARGBCopy +// subsample amount uses a divide. 
#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a)) // Planar test @@ -180,9 +181,12 @@ TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I012, uint16_t, 2, 2, 2, 12) TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2, 10) TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I420, uint8_t, 1, 2, 2, 10) TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I422, uint8_t, 1, 2, 1, 10) +TESTPLANARTOP(I410, uint16_t, 2, 1, 1, I420, uint8_t, 1, 2, 2, 10) TESTPLANARTOP(I410, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 10) TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2, 12) +TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I420, uint8_t, 1, 2, 2, 12) TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I422, uint8_t, 1, 2, 1, 12) +TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I420, uint8_t, 1, 2, 2, 12) TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 12) // Test Android 420 to I420 @@ -417,131 +421,136 @@ TESTPLANARTOBP(I210, uint16_t, 2, 2, 1, P210, uint16_t, 2, 2, 1, 10) TESTPLANARTOBP(I012, uint16_t, 2, 2, 2, P012, uint16_t, 2, 2, 2, 12) TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12) -#define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ - DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \ - DOY, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ - TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ - static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \ - static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \ - static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \ - "SRC_SUBSAMP_X unsupported"); \ - static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \ - "SRC_SUBSAMP_Y unsupported"); \ - static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \ - "DST_SUBSAMP_X unsupported"); \ - static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ - "DST_SUBSAMP_Y unsupported"); \ - const int kWidth = W1280; \ - const int kHeight = benchmark_height_; \ - const int kSrcHalfWidth = 
SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ - const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \ - const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \ - const int kPaddedWidth = (kWidth + (TILE_WIDTH - 1)) & ~(TILE_WIDTH - 1); \ - const int kPaddedHeight = \ - (kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1); \ - const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X); \ - const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y); \ - align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF); \ - align_buffer_page_end( \ - src_uv, \ - 2 * kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * SRC_BPC + OFF); \ - align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \ - align_buffer_page_end(dst_uv_c, \ - 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ - align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \ - align_buffer_page_end(dst_uv_opt, \ - 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ - SRC_T* src_y_p = reinterpret_cast(src_y + OFF); \ - SRC_T* src_uv_p = reinterpret_cast(src_uv + OFF); \ - for (int i = 0; i < kPaddedWidth * kPaddedHeight; ++i) { \ - src_y_p[i] = \ - (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \ - } \ - for (int i = 0; i < kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * 2; ++i) { \ - src_uv_p[i] = \ - (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \ - } \ - memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \ - memset(dst_uv_c, 2, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ - memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \ - memset(dst_uv_opt, 102, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ - MaskCpuFlags(disable_cpu_flags_); \ - SRC_FMT_PLANAR##To##FMT_PLANAR( \ - src_y_p, kWidth, src_uv_p, 2 * kSrcHalfWidth, \ - DOY ? 
reinterpret_cast(dst_y_c) : NULL, kWidth, \ - reinterpret_cast(dst_uv_c), 2 * kDstHalfWidth, kWidth, \ - NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - SRC_FMT_PLANAR##To##FMT_PLANAR( \ - src_y_p, kWidth, src_uv_p, 2 * kSrcHalfWidth, \ - DOY ? reinterpret_cast(dst_y_opt) : NULL, kWidth, \ - reinterpret_cast(dst_uv_opt), 2 * kDstHalfWidth, kWidth, \ - NEG kHeight); \ - } \ - if (DOY) { \ - for (int i = 0; i < kHeight; ++i) { \ - for (int j = 0; j < kWidth; ++j) { \ - EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ - } \ - } \ - } \ - for (int i = 0; i < kDstHalfHeight; ++i) { \ - for (int j = 0; j < 2 * kDstHalfWidth; ++j) { \ - EXPECT_EQ(dst_uv_c[i * 2 * kDstHalfWidth + j], \ - dst_uv_opt[i * 2 * kDstHalfWidth + j]); \ - } \ - } \ - free_aligned_buffer_page_end(dst_y_c); \ - free_aligned_buffer_page_end(dst_uv_c); \ - free_aligned_buffer_page_end(dst_y_opt); \ - free_aligned_buffer_page_end(dst_uv_opt); \ - free_aligned_buffer_page_end(src_y); \ - free_aligned_buffer_page_end(src_uv); \ +#define TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, W1280, N, NEG, OFF, DOY, SRC_DEPTH, \ + TILE_WIDTH, TILE_HEIGHT) \ + TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ + static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \ + static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \ + "SRC_SUBSAMP_X unsupported"); \ + static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \ + "SRC_SUBSAMP_Y unsupported"); \ + static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \ + "DST_SUBSAMP_X unsupported"); \ + static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ + "DST_SUBSAMP_Y unsupported"); \ + const int kWidth = W1280; \ + const int kHeight = benchmark_height_; \ + const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ + const int kDstHalfWidth = SUBSAMPLE(kWidth, 
DST_SUBSAMP_X); \ + const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \ + const int kPaddedWidth = (kWidth + (TILE_WIDTH - 1)) & ~(TILE_WIDTH - 1); \ + const int kPaddedHeight = \ + (kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1); \ + const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X); \ + const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y); \ + align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF); \ + align_buffer_page_end( \ + src_uv, \ + 2 * kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * SRC_BPC + OFF); \ + align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \ + align_buffer_page_end(dst_uv_c, \ + 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \ + align_buffer_page_end(dst_uv_opt, \ + 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + SRC_T* src_y_p = reinterpret_cast(src_y + OFF); \ + SRC_T* src_uv_p = reinterpret_cast(src_uv + OFF); \ + for (int i = 0; \ + i < kPaddedWidth * kPaddedHeight * SRC_BPC / (int)sizeof(SRC_T); \ + ++i) { \ + src_y_p[i] = \ + (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \ + } \ + for (int i = 0; i < kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * 2 * \ + SRC_BPC / (int)sizeof(SRC_T); \ + ++i) { \ + src_uv_p[i] = \ + (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \ + } \ + memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \ + memset(dst_uv_c, 2, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \ + memset(dst_uv_opt, 102, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + MaskCpuFlags(disable_cpu_flags_); \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y_p, kWidth* SRC_BPC / (int)sizeof(SRC_T), src_uv_p, \ + 2 * kSrcHalfWidth * SRC_BPC / (int)sizeof(SRC_T), \ + DOY ? 
reinterpret_cast(dst_y_c) : NULL, kWidth, \ + reinterpret_cast(dst_uv_c), 2 * kDstHalfWidth, kWidth, \ + NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y_p, kWidth* SRC_BPC / (int)sizeof(SRC_T), src_uv_p, \ + 2 * kSrcHalfWidth * SRC_BPC / (int)sizeof(SRC_T), \ + DOY ? reinterpret_cast(dst_y_opt) : NULL, kWidth, \ + reinterpret_cast(dst_uv_opt), 2 * kDstHalfWidth, kWidth, \ + NEG kHeight); \ + } \ + if (DOY) { \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ + } \ + } \ + } \ + for (int i = 0; i < kDstHalfHeight; ++i) { \ + for (int j = 0; j < 2 * kDstHalfWidth; ++j) { \ + EXPECT_EQ(dst_uv_c[i * 2 * kDstHalfWidth + j], \ + dst_uv_opt[i * 2 * kDstHalfWidth + j]); \ + } \ + } \ + free_aligned_buffer_page_end(dst_y_c); \ + free_aligned_buffer_page_end(dst_uv_c); \ + free_aligned_buffer_page_end(dst_y_opt); \ + free_aligned_buffer_page_end(dst_uv_opt); \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_uv); \ } -#define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ - DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, \ - TILE_HEIGHT) \ - TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, 1, \ - SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ - TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 2, 1, \ - SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ - TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1, \ - 
SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ - TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1, SRC_DEPTH, \ - TILE_WIDTH, TILE_HEIGHT) \ - TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0, \ - SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) - -TESTBIPLANARTOBP(NV21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 1, 1) -TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV12Mirror, uint8_t, 1, 2, 2, 8, 1, 1) -TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV24, uint8_t, 1, 1, 1, 8, 1, 1) -TESTBIPLANARTOBP(NV16, uint8_t, 1, 2, 1, NV24, uint8_t, 1, 1, 1, 8, 1, 1) -TESTBIPLANARTOBP(P010, uint16_t, 2, 2, 2, P410, uint16_t, 2, 1, 1, 10, 1, 1) -TESTBIPLANARTOBP(P210, uint16_t, 2, 2, 1, P410, uint16_t, 2, 1, 1, 10, 1, 1) -TESTBIPLANARTOBP(P012, uint16_t, 2, 2, 2, P412, uint16_t, 2, 1, 1, 10, 1, 1) -TESTBIPLANARTOBP(P212, uint16_t, 2, 2, 1, P412, uint16_t, 2, 1, 1, 12, 1, 1) -TESTBIPLANARTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 12, 1, 1) -TESTBIPLANARTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 12, 1, 1) -TESTBIPLANARTOBP(MM21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 16, 32) - -#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ - DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \ - SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ +#define TESTBPTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ + TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_ + 1, _Any, +, 0, 1, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) \ + TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, 
SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _Unaligned, +, 2, 1, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) \ + TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _Invert, -, 0, 1, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) \ + TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0, 1, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) \ + TESTBPTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _NullY, +, 0, 0, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) + +TESTBPTOBP(NV21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 1, 1) +TESTBPTOBP(NV12, uint8_t, 1, 2, 2, NV12Mirror, uint8_t, 1, 2, 2, 8, 1, 1) +TESTBPTOBP(NV12, uint8_t, 1, 2, 2, NV24, uint8_t, 1, 1, 1, 8, 1, 1) +TESTBPTOBP(NV16, uint8_t, 1, 2, 1, NV24, uint8_t, 1, 1, 1, 8, 1, 1) +TESTBPTOBP(P010, uint16_t, 2, 2, 2, P410, uint16_t, 2, 1, 1, 10, 1, 1) +TESTBPTOBP(P210, uint16_t, 2, 2, 1, P410, uint16_t, 2, 1, 1, 10, 1, 1) +TESTBPTOBP(P012, uint16_t, 2, 2, 2, P412, uint16_t, 2, 1, 1, 10, 1, 1) +TESTBPTOBP(P212, uint16_t, 2, 2, 1, P412, uint16_t, 2, 1, 1, 12, 1, 1) +TESTBPTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 12, 1, 1) +TESTBPTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 12, 1, 1) +TESTBPTOBP(MM21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 16, 32) +TESTBPTOBP(MT2T, uint8_t, 10 / 8, 2, 2, P010, uint16_t, 2, 2, 2, 10, 16, 32) + +#define TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, W1280, N, NEG, OFF, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) \ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ 
static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \ static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \ @@ -621,30 +630,30 @@ TESTBIPLANARTOBP(MM21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 16, 32) free_aligned_buffer_page_end(src_uv); \ } -#define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ - DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, \ - TILE_HEIGHT) \ - TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH, \ - TILE_WIDTH, TILE_HEIGHT) \ - TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 2, \ - SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ - TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, SRC_DEPTH, \ - TILE_WIDTH, TILE_HEIGHT) \ - TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ - SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, SRC_DEPTH, \ - TILE_WIDTH, TILE_HEIGHT) - -TESTBIPLANARTOP(NV12, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 1, 1) -TESTBIPLANARTOP(NV21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 1, 1) -TESTBIPLANARTOP(MM21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 16, 32) +#define TESTBPTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \ + TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) \ + 
TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _Unaligned, +, 2, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) \ + TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _Invert, -, 0, SRC_DEPTH, TILE_WIDTH, \ + TILE_HEIGHT) \ + TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) + +TESTBPTOP(NV12, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 1, 1) +TESTBPTOP(NV21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 1, 1) +TESTBPTOP(MM21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 16, 32) +TESTBPTOP(P010, uint16_t, 2, 2, 2, I010, uint16_t, 2, 2, 2, 10, 1, 1) +TESTBPTOP(P012, uint16_t, 2, 2, 2, I012, uint16_t, 2, 2, 2, 12, 1, 1) // Provide matrix wrappers for full range bt.709 #define F420ToABGR(a, b, c, d, e, f, g, h, i, j) \ @@ -680,6 +689,12 @@ TESTBIPLANARTOP(MM21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 16, 32) #define I422ToARGBFilter(a, b, c, d, e, f, g, h, i, j) \ I422ToARGBMatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \ kFilterBilinear) +#define I420ToRGB24Filter(a, b, c, d, e, f, g, h, i, j) \ + I420ToRGB24MatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \ + kFilterBilinear) +#define I422ToRGB24Filter(a, b, c, d, e, f, g, h, i, j) \ + I420ToRGB24MatrixFilter(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j, \ + kFilterBilinear) #define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN)) @@ -792,8 +807,12 @@ TESTPLANARTOB(V422, 2, 1, ARGB, 4, 4, 1) TESTPLANARTOB(V422, 2, 1, ABGR, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1) +TESTPLANARTOB(I422, 1, 1, RGB24, 3, 3, 1) +TESTPLANARTOB(I422, 1, 1, RAW, 3, 3, 1) 
TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1) TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1) +TESTPLANARTOB(I444, 1, 1, RGB24, 3, 3, 1) +TESTPLANARTOB(I444, 1, 1, RAW, 3, 3, 1) TESTPLANARTOB(J444, 1, 1, ARGB, 4, 4, 1) TESTPLANARTOB(J444, 1, 1, ABGR, 4, 4, 1) TESTPLANARTOB(H444, 1, 1, ARGB, 4, 4, 1) @@ -816,6 +835,8 @@ TESTPLANARTOB(H420, 2, 2, AB30, 4, 4, 1) #endif TESTPLANARTOB(I420, 2, 2, ARGBFilter, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, ARGBFilter, 4, 4, 1) +TESTPLANARTOB(I420, 2, 2, RGB24Filter, 3, 3, 1) +TESTPLANARTOB(I422, 2, 2, RGB24Filter, 3, 3, 1) #else TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1) TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1) @@ -832,14 +853,15 @@ TESTPLANARTOB(I422, 2, 1, RGB565, 2, 2, 1) TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1) TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1) TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1) -TESTPLANARTOB(I420, 2, 2, ARGBFilter, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1) TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1) +TESTPLANARTOB(I420, 2, 2, ARGBFilter, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, ARGBFilter, 4, 4, 1) +TESTPLANARTOB(I420, 2, 2, RGB24Filter, 3, 3, 1) TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1) TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1) #endif @@ -1056,8 +1078,8 @@ TESTQPLANARTOB(I420Alpha, 2, 2, ARGBFilter, 4, 4, 1) TESTQPLANARTOB(I422Alpha, 2, 1, ARGBFilter, 4, 4, 1) #endif -#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, \ - BPP_B, W1280, N, NEG, OFF) \ +#define TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ @@ -1110,15 +1132,15 @@ TESTQPLANARTOB(I422Alpha, 2, 1, ARGBFilter, 4, 4, 1) free_aligned_buffer_page_end(dst_argb32_opt); \ } -#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, 
SUBSAMP_Y, FMT_B, FMT_C, BPP_B) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - benchmark_width_ + 1, _Any, +, 0) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - benchmark_width_, _Unaligned, +, 2) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - benchmark_width_, _Invert, -, 0) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - benchmark_width_, _Opt, +, 0) +#define TESTBPTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B) \ + TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + benchmark_width_ + 1, _Any, +, 0) \ + TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + benchmark_width_, _Unaligned, +, 2) \ + TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + benchmark_width_, _Invert, -, 0) \ + TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + benchmark_width_, _Opt, +, 0) #define JNV12ToARGB(a, b, c, d, e, f, g, h) \ NV12ToARGBMatrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h) @@ -1139,29 +1161,29 @@ TESTQPLANARTOB(I422Alpha, 2, 1, ARGBFilter, 4, 4, 1) #define JNV12ToRGB565(a, b, c, d, e, f, g, h) \ NV12ToRGB565Matrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h) -TESTBIPLANARTOB(JNV12, 2, 2, ARGB, ARGB, 4) -TESTBIPLANARTOB(JNV21, 2, 2, ARGB, ARGB, 4) -TESTBIPLANARTOB(JNV12, 2, 2, ABGR, ABGR, 4) -TESTBIPLANARTOB(JNV21, 2, 2, ABGR, ABGR, 4) -TESTBIPLANARTOB(JNV12, 2, 2, RGB24, RGB24, 3) -TESTBIPLANARTOB(JNV21, 2, 2, RGB24, RGB24, 3) -TESTBIPLANARTOB(JNV12, 2, 2, RAW, RAW, 3) -TESTBIPLANARTOB(JNV21, 2, 2, RAW, RAW, 3) +TESTBPTOB(JNV12, 2, 2, ARGB, ARGB, 4) +TESTBPTOB(JNV21, 2, 2, ARGB, ARGB, 4) +TESTBPTOB(JNV12, 2, 2, ABGR, ABGR, 4) +TESTBPTOB(JNV21, 2, 2, ABGR, ABGR, 4) +TESTBPTOB(JNV12, 2, 2, RGB24, RGB24, 3) +TESTBPTOB(JNV21, 2, 2, RGB24, RGB24, 3) +TESTBPTOB(JNV12, 2, 2, RAW, RAW, 3) +TESTBPTOB(JNV21, 2, 2, RAW, RAW, 3) #ifdef LITTLE_ENDIAN_ONLY_TEST 
-TESTBIPLANARTOB(JNV12, 2, 2, RGB565, RGB565, 2) +TESTBPTOB(JNV12, 2, 2, RGB565, RGB565, 2) #endif -TESTBIPLANARTOB(NV12, 2, 2, ARGB, ARGB, 4) -TESTBIPLANARTOB(NV21, 2, 2, ARGB, ARGB, 4) -TESTBIPLANARTOB(NV12, 2, 2, ABGR, ABGR, 4) -TESTBIPLANARTOB(NV21, 2, 2, ABGR, ABGR, 4) -TESTBIPLANARTOB(NV12, 2, 2, RGB24, RGB24, 3) -TESTBIPLANARTOB(NV21, 2, 2, RGB24, RGB24, 3) -TESTBIPLANARTOB(NV12, 2, 2, RAW, RAW, 3) -TESTBIPLANARTOB(NV21, 2, 2, RAW, RAW, 3) -TESTBIPLANARTOB(NV21, 2, 2, YUV24, RAW, 3) +TESTBPTOB(NV12, 2, 2, ARGB, ARGB, 4) +TESTBPTOB(NV21, 2, 2, ARGB, ARGB, 4) +TESTBPTOB(NV12, 2, 2, ABGR, ABGR, 4) +TESTBPTOB(NV21, 2, 2, ABGR, ABGR, 4) +TESTBPTOB(NV12, 2, 2, RGB24, RGB24, 3) +TESTBPTOB(NV21, 2, 2, RGB24, RGB24, 3) +TESTBPTOB(NV12, 2, 2, RAW, RAW, 3) +TESTBPTOB(NV21, 2, 2, RAW, RAW, 3) +TESTBPTOB(NV21, 2, 2, YUV24, RAW, 3) #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTBIPLANARTOB(NV12, 2, 2, RGB565, RGB565, 2) +TESTBPTOB(NV12, 2, 2, RGB565, RGB565, 2) #endif #define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ @@ -1236,6 +1258,8 @@ TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1) TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1) TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2) TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1) +TESTATOPLANAR(ABGR, 4, 1, J420, 2, 2) +TESTATOPLANAR(ABGR, 4, 1, J422, 2, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2) TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2) @@ -1254,8 +1278,84 @@ TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1) TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2) TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1) -#define TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, \ - SUBSAMP_Y, W1280, N, NEG, OFF) \ +#define TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, \ + SUBSAMP_Y, W1280, N, NEG, OFF) \ + TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \ + const int kWidth = W1280; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ + const int kStride = (kStrideUV 
* SUBSAMP_X * 8 * BPP_A + 7) / 8; \ + align_buffer_page_end(src_argb, kStride* kHeight + OFF); \ + align_buffer_page_end(dst_a_c, kWidth* kHeight); \ + align_buffer_page_end(dst_y_c, kWidth* kHeight); \ + align_buffer_page_end(dst_uv_c, \ + kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_a_opt, kWidth* kHeight); \ + align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ + align_buffer_page_end(dst_uv_opt, \ + kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_a_c, 1, kWidth* kHeight); \ + memset(dst_y_c, 2, kWidth* kHeight); \ + memset(dst_uv_c, 3, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_a_opt, 101, kWidth* kHeight); \ + memset(dst_y_opt, 102, kWidth* kHeight); \ + memset(dst_uv_opt, 103, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kStride; ++j) \ + src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \ + kStrideUV * 2, dst_uv_c + kStrideUV, kStrideUV * 2, \ + dst_a_c, kWidth, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \ + dst_uv_opt, kStrideUV * 2, dst_uv_opt + kStrideUV, \ + kStrideUV * 2, dst_a_opt, kWidth, kWidth, \ + NEG kHeight); \ + } \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ + EXPECT_EQ(dst_a_c[i * kWidth + j], dst_a_opt[i * kWidth + j]); \ + } \ + } \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; ++i) { \ + for (int j = 0; j < kStrideUV; ++j) { \ + EXPECT_EQ(dst_uv_c[i * kStrideUV + j], dst_uv_opt[i * kStrideUV + j]); \ + } \ + } \ + free_aligned_buffer_page_end(dst_a_c); \ + free_aligned_buffer_page_end(dst_y_c); \ + free_aligned_buffer_page_end(dst_uv_c); 
\ + free_aligned_buffer_page_end(dst_a_opt); \ + free_aligned_buffer_page_end(dst_y_opt); \ + free_aligned_buffer_page_end(dst_uv_opt); \ + free_aligned_buffer_page_end(src_argb); \ + } + +#if defined(ENABLE_FULL_TESTS) +#define TESTATOPLANARA(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ + TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_ + 1, _Any, +, 0) \ + TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Unaligned, +, 2) \ + TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Invert, -, 0) \ + TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0) +#else +#define TESTATOPLANARA(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ + TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_ + 1, _Any, +, 0) \ + TESTATOPLANARAI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0) +#endif + +TESTATOPLANARA(ARGB, 4, 1, I420Alpha, 2, 2) + +#define TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \ const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ @@ -1301,25 +1401,25 @@ TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1) free_aligned_buffer_page_end(src_argb); \ } -#define TESTATOBIPLANAR(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ - TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_ + 1, _Any, +, 0) \ - TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, _Unaligned, +, 2) \ - TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, _Invert, -, 0) \ - TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_, _Opt, +, 0) - 
-TESTATOBIPLANAR(ARGB, 1, 4, NV12, 2, 2) -TESTATOBIPLANAR(ARGB, 1, 4, NV21, 2, 2) -TESTATOBIPLANAR(ABGR, 1, 4, NV12, 2, 2) -TESTATOBIPLANAR(ABGR, 1, 4, NV21, 2, 2) -TESTATOBIPLANAR(RAW, 1, 3, JNV21, 2, 2) -TESTATOBIPLANAR(YUY2, 2, 4, NV12, 2, 2) -TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2) -TESTATOBIPLANAR(AYUV, 1, 4, NV12, 2, 2) -TESTATOBIPLANAR(AYUV, 1, 4, NV21, 2, 2) +#define TESTATOBP(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ + TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_ + 1, _Any, +, 0) \ + TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Unaligned, +, 2) \ + TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Invert, -, 0) \ + TESTATOBPI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0) + +TESTATOBP(ARGB, 1, 4, NV12, 2, 2) +TESTATOBP(ARGB, 1, 4, NV21, 2, 2) +TESTATOBP(ABGR, 1, 4, NV12, 2, 2) +TESTATOBP(ABGR, 1, 4, NV21, 2, 2) +TESTATOBP(RAW, 1, 3, JNV21, 2, 2) +TESTATOBP(YUY2, 2, 4, NV12, 2, 2) +TESTATOBP(UYVY, 2, 4, NV12, 2, 2) +TESTATOBP(AYUV, 1, 4, NV12, 2, 2) +TESTATOBP(AYUV, 1, 4, NV21, 2, 2) #define TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \ EPP_B, STRIDE_B, HEIGHT_B, W1280, N, NEG, OFF) \ @@ -1440,6 +1540,7 @@ TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGBMirror, uint8_t, 4, 4, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, BGRA, uint8_t, 4, 4, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, I400, uint8_t, 1, 1, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) +TESTATOB(ABGR, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) TESTATOB(RGBA, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1) @@ -1450,7 +1551,7 @@ TESTATOB(ARGB, uint8_t, 4, 4, 1, RGB565, uint8_t, 2, 2, 1) #endif TESTATOB(ARGB, uint8_t, 4, 4, 1, RGBA, uint8_t, 4, 4, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, UYVY, uint8_t, 2, 4, 
1) -TESTATOB(ARGB, uint8_t, 4, 4, 1, YUY2, uint8_t, 2, 4, 1) // 4 +TESTATOB(ARGB, uint8_t, 4, 4, 1, YUY2, uint8_t, 2, 4, 1) TESTATOB(ARGB1555, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) TESTATOB(ARGB4444, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) TESTATOB(BGRA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) @@ -1484,6 +1585,127 @@ TESTATOB(AB64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) TESTATOB(AR64, uint16_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1) TESTATOB(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) +// in place test +#define TESTATOAI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \ + EPP_B, STRIDE_B, HEIGHT_B, W1280, N, NEG, OFF) \ + TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \ + const int kWidth = W1280; \ + const int kHeight = benchmark_height_; \ + const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ + const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ + const int kStrideA = \ + (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ + const int kStrideB = \ + (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ + align_buffer_page_end(src_argb, \ + kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \ + align_buffer_page_end(dst_argb_c, \ + kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \ + align_buffer_page_end(dst_argb_opt, \ + kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \ + for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \ + src_argb[i + OFF] = (fastrand() & 0xff); \ + } \ + memcpy(dst_argb_c + OFF, src_argb, \ + kStrideA * kHeightA * (int)sizeof(TYPE_A)); \ + memcpy(dst_argb_opt + OFF, src_argb, \ + kStrideA * kHeightA * (int)sizeof(TYPE_A)); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_A##To##FMT_B((TYPE_A*)(dst_argb_c /* src */ + OFF), kStrideA, \ + (TYPE_B*)dst_argb_c, kStrideB, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_B((TYPE_A*)(dst_argb_opt /* src */ + OFF), kStrideA, \ + 
(TYPE_B*)dst_argb_opt, kStrideB, kWidth, NEG kHeight); \ + } \ + memcpy(dst_argb_opt + OFF, src_argb, \ + kStrideA * kHeightA * (int)sizeof(TYPE_A)); \ + FMT_A##To##FMT_B((TYPE_A*)(dst_argb_opt /* src */ + OFF), kStrideA, \ + (TYPE_B*)dst_argb_opt, kStrideB, kWidth, NEG kHeight); \ + for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) { \ + EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ + } \ + free_aligned_buffer_page_end(src_argb); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + } + +#define TESTATOA(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \ + EPP_B, STRIDE_B, HEIGHT_B) \ + TESTATOAI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \ + STRIDE_B, HEIGHT_B, benchmark_width_, _Inplace, +, 0) + +TESTATOA(AB30, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) +TESTATOA(AB30, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTATOA(ABGR, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1) +#endif +TESTATOA(ABGR, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTATOA(AR30, uint8_t, 4, 4, 1, AB30, uint8_t, 4, 4, 1) +#endif +TESTATOA(AR30, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTATOA(AR30, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1) +TESTATOA(AR30, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +#endif +TESTATOA(ARGB, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTATOA(ARGB, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1) +#endif +TESTATOA(ARGB, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, ARGB1555, uint8_t, 2, 2, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, ARGB4444, uint8_t, 2, 2, 1) +// TODO(fbarchard): Support in place for mirror. 
+// TESTATOA(ARGB, uint8_t, 4, 4, 1, ARGBMirror, uint8_t, 4, 4, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, BGRA, uint8_t, 4, 4, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, I400, uint8_t, 1, 1, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) +TESTATOA(RGBA, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1) +TESTATOA(ABGR, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1) +TESTATOA(ABGR, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTATOA(ARGB, uint8_t, 4, 4, 1, RGB565, uint8_t, 2, 2, 1) +#endif +TESTATOA(ARGB, uint8_t, 4, 4, 1, RGBA, uint8_t, 4, 4, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, UYVY, uint8_t, 2, 4, 1) +TESTATOA(ARGB, uint8_t, 4, 4, 1, YUY2, uint8_t, 2, 4, 1) +// TODO(fbarchard): Support in place for conversions that increase bpp. +// TESTATOA(ARGB1555, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) +// TESTATOA(ARGB4444, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOA(BGRA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +// TESTATOA(I400, uint8_t, 1, 1, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOA(I400, uint8_t, 1, 1, 1, I400, uint8_t, 1, 1, 1) +// TESTATOA(I400, uint8_t, 1, 1, 1, I400Mirror, uint8_t, 1, 1, 1) +// TESTATOA(J400, uint8_t, 1, 1, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOA(J400, uint8_t, 1, 1, 1, J400, uint8_t, 1, 1, 1) +// TESTATOA(RAW, uint8_t, 3, 3, 1, ARGB, uint8_t, 4, 4, 1) +// TESTATOA(RAW, uint8_t, 3, 3, 1, RGBA, uint8_t, 4, 4, 1) +TESTATOA(RAW, uint8_t, 3, 3, 1, RGB24, uint8_t, 3, 3, 1) +// TESTATOA(RGB24, uint8_t, 3, 3, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOA(RGB24, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1) +// TESTATOA(RGB24, uint8_t, 3, 3, 1, RGB24Mirror, uint8_t, 3, 3, 1) +TESTATOA(RAW, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1) +#ifdef LITTLE_ENDIAN_ONLY_TEST +// TESTATOA(RGB565, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) +#endif +TESTATOA(RGBA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +// TESTATOA(UYVY, uint8_t, 2, 4, 1, ARGB, 
uint8_t, 4, 4, 1) +// TESTATOA(YUY2, uint8_t, 2, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOA(YUY2, uint8_t, 2, 4, 1, Y, uint8_t, 1, 1, 1) +// TESTATOA(ARGB, uint8_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) +// TESTATOA(ARGB, uint8_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1) +// TESTATOA(ABGR, uint8_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) +// TESTATOA(ABGR, uint8_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1) +TESTATOA(AR64, uint16_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOA(AB64, uint16_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOA(AR64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) +TESTATOA(AB64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) +TESTATOA(AR64, uint16_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1) +TESTATOA(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) + #define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ HEIGHT_B, W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither##N) { \ @@ -2065,6 +2287,9 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420) { int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } align_buffer_page_end(dst_y, width * height); align_buffer_page_end(dst_u, half_width * half_height); @@ -2099,6 +2324,9 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420_NV21) { int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } // Convert to NV21 align_buffer_page_end(dst_y, width * height); @@ -2158,6 +2386,9 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420_NV12) { int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } // Convert to NV12 align_buffer_page_end(dst_y, width * height); @@ -2217,6 +2448,9 
@@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_420) { int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } align_buffer_page_end(dst_y, width * height); align_buffer_page_end(dst_uv, half_width * half_height * 2); @@ -2247,6 +2481,9 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV12_420) { int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } align_buffer_page_end(dst_y, width * height); align_buffer_page_end(dst_uv, half_width * half_height * 2); @@ -2282,6 +2519,9 @@ TEST_F(LibYUVConvertTest, DISABLED_TestMJPGToNV21_422) { int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } align_buffer_page_end(dst_y, width * height); align_buffer_page_end(dst_uv, half_width * half_height * 2); @@ -2312,6 +2552,9 @@ TEST_F(LibYUVConvertTest, DISABLED_TestMJPGToNV12_422) { int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } align_buffer_page_end(dst_y, width * height); align_buffer_page_end(dst_uv, half_width * half_height * 2); @@ -2346,6 +2589,9 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_400) { int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } align_buffer_page_end(dst_y, width * height); align_buffer_page_end(dst_uv, half_width * half_height * 2); @@ -2376,6 +2622,9 @@ TEST_F(LibYUVConvertTest, 
TestMJPGToNV12_400) { int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } align_buffer_page_end(dst_y, width * height); align_buffer_page_end(dst_uv, half_width * half_height * 2); @@ -2410,6 +2659,9 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_444) { int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } align_buffer_page_end(dst_y, width * height); align_buffer_page_end(dst_uv, half_width * half_height * 2); @@ -2440,6 +2692,9 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV12_444) { int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } align_buffer_page_end(dst_y, width * height); align_buffer_page_end(dst_uv, half_width * half_height * 2); @@ -2472,6 +2727,9 @@ TEST_F(LibYUVConvertTest, TestMJPGToARGB) { int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); + if (benchmark_iterations < 1) { + benchmark_iterations = 1; + } align_buffer_page_end(dst_argb, width * height * 4); for (int times = 0; times < benchmark_iterations; ++times) { @@ -2921,6 +3179,51 @@ TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, ARGB, 4) TESTPTOB(TestYUY2ToNV12, YUY2ToI420, YUY2ToNV12) TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12) +TEST_F(LibYUVConvertTest, MM21ToYUY2) { + const int kWidth = (benchmark_width_ + 15) & (~15); + const int kHeight = (benchmark_height_ + 31) & (~31); + + align_buffer_page_end(orig_y, kWidth * kHeight); + align_buffer_page_end(orig_uv, + 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); + + align_buffer_page_end(tmp_y, kWidth * kHeight); + 
align_buffer_page_end(tmp_u, SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); + align_buffer_page_end(tmp_v, SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); + + align_buffer_page_end(dst_yuyv, 4 * SUBSAMPLE(kWidth, 2) * kHeight); + align_buffer_page_end(golden_yuyv, 4 * SUBSAMPLE(kWidth, 2) * kHeight); + + MemRandomize(orig_y, kWidth * kHeight); + MemRandomize(orig_uv, 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); + + /* Convert MM21 to YUY2 in 2 steps for reference */ + libyuv::MM21ToI420(orig_y, kWidth, orig_uv, 2 * SUBSAMPLE(kWidth, 2), tmp_y, + kWidth, tmp_u, SUBSAMPLE(kWidth, 2), tmp_v, + SUBSAMPLE(kWidth, 2), kWidth, kHeight); + libyuv::I420ToYUY2(tmp_y, kWidth, tmp_u, SUBSAMPLE(kWidth, 2), tmp_v, + SUBSAMPLE(kWidth, 2), golden_yuyv, + 4 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); + + /* Convert to NV12 */ + for (int i = 0; i < benchmark_iterations_; ++i) { + libyuv::MM21ToYUY2(orig_y, kWidth, orig_uv, 2 * SUBSAMPLE(kWidth, 2), + dst_yuyv, 4 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); + } + + for (int i = 0; i < 4 * SUBSAMPLE(kWidth, 2) * kHeight; ++i) { + EXPECT_EQ(dst_yuyv[i], golden_yuyv[i]); + } + + free_aligned_buffer_page_end(orig_y); + free_aligned_buffer_page_end(orig_uv); + free_aligned_buffer_page_end(tmp_y); + free_aligned_buffer_page_end(tmp_u); + free_aligned_buffer_page_end(tmp_v); + free_aligned_buffer_page_end(dst_yuyv); + free_aligned_buffer_page_end(golden_yuyv); +} + // Transitive test. A to B to C is same as A to C. // Benchmarks A To B to C for comparison to 1 step, benchmarked elsewhere. 
#define TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ @@ -3353,6 +3656,8 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) { I012ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) #define I012ToAR30(a, b, c, d, e, f, g, h, i, j) \ I012ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) +#define I012ToAB30(a, b, c, d, e, f, g, h, i, j) \ + I012ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) #define I410ToARGB(a, b, c, d, e, f, g, h, i, j) \ I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) @@ -3495,6 +3800,7 @@ TESTPLANAR16TOB(H410, 1, 1, 0x3ff, AB30, 4, 4, 1) TESTPLANAR16TOB(U410, 1, 1, 0x3ff, AR30, 4, 4, 1) TESTPLANAR16TOB(U410, 1, 1, 0x3ff, AB30, 4, 4, 1) TESTPLANAR16TOB(I012, 2, 2, 0xfff, AR30, 4, 4, 1) +TESTPLANAR16TOB(I012, 2, 2, 0xfff, AB30, 4, 4, 1) TESTPLANAR16TOB(I010, 2, 2, 0x3ff, AR30Filter, 4, 4, 1) TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AR30Filter, 4, 4, 1) #endif // LITTLE_ENDIAN_ONLY_TEST @@ -3733,8 +4039,8 @@ TESTQPLANAR16TOB(I010Alpha, 2, 2, ARGBFilter, 4, 4, 1, 10) TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGBFilter, 4, 4, 1, 10) #endif // DISABLE_SLOW_TESTS -#define TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ - ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF, S_DEPTH) \ +#define TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, W1280, N, NEG, SOFF, DOFF, S_DEPTH) \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ const int kWidth = W1280; \ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ @@ -3777,16 +4083,16 @@ TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGBFilter, 4, 4, 1, 10) free_aligned_buffer_page_end(dst_argb_opt); \ } -#define TESTBIPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ - ALIGN, YALIGN, S_DEPTH) \ - TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_ + 1, _Any, +, 0, 0, S_DEPTH) \ - TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, 
BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Unaligned, +, 4, 4, S_DEPTH) \ - TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Invert, -, 0, 0, S_DEPTH) \ - TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Opt, +, 0, 0, S_DEPTH) +#define TESTBP16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, S_DEPTH) \ + TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN, \ + benchmark_width_ + 1, _Any, +, 0, 0, S_DEPTH) \ + TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN, \ + benchmark_width_, _Unaligned, +, 4, 4, S_DEPTH) \ + TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN, \ + benchmark_width_, _Invert, -, 0, 0, S_DEPTH) \ + TESTBP16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN, \ + benchmark_width_, _Opt, +, 0, 0, S_DEPTH) #define P010ToARGB(a, b, c, d, e, f, g, h) \ P010ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) @@ -3829,23 +4135,23 @@ TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGBFilter, 4, 4, 1, 10) kFilterBilinear) #if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__) -TESTBIPLANAR16TOB(P010, 2, 2, ARGB, 4, 4, 1, 10) -TESTBIPLANAR16TOB(P210, 2, 1, ARGB, 4, 4, 1, 10) -TESTBIPLANAR16TOB(P012, 2, 2, ARGB, 4, 4, 1, 12) -TESTBIPLANAR16TOB(P212, 2, 1, ARGB, 4, 4, 1, 12) -TESTBIPLANAR16TOB(P016, 2, 2, ARGB, 4, 4, 1, 16) -TESTBIPLANAR16TOB(P216, 2, 1, ARGB, 4, 4, 1, 16) -TESTBIPLANAR16TOB(P010, 2, 2, ARGBFilter, 4, 4, 1, 10) -TESTBIPLANAR16TOB(P210, 2, 1, ARGBFilter, 4, 4, 1, 10) +TESTBP16TOB(P010, 2, 2, ARGB, 4, 4, 1, 10) +TESTBP16TOB(P210, 2, 1, ARGB, 4, 4, 1, 10) +TESTBP16TOB(P012, 2, 2, ARGB, 4, 4, 1, 12) +TESTBP16TOB(P212, 2, 1, ARGB, 4, 4, 1, 12) +TESTBP16TOB(P016, 2, 2, ARGB, 4, 4, 1, 16) +TESTBP16TOB(P216, 2, 1, ARGB, 4, 4, 1, 16) +TESTBP16TOB(P010, 2, 2, ARGBFilter, 4, 4, 1, 10) +TESTBP16TOB(P210, 2, 1, 
ARGBFilter, 4, 4, 1, 10) #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTBIPLANAR16TOB(P010, 2, 2, AR30, 4, 4, 1, 10) -TESTBIPLANAR16TOB(P210, 2, 1, AR30, 4, 4, 1, 10) -TESTBIPLANAR16TOB(P012, 2, 2, AR30, 4, 4, 1, 12) -TESTBIPLANAR16TOB(P212, 2, 1, AR30, 4, 4, 1, 12) -TESTBIPLANAR16TOB(P016, 2, 2, AR30, 4, 4, 1, 16) -TESTBIPLANAR16TOB(P216, 2, 1, AR30, 4, 4, 1, 16) -TESTBIPLANAR16TOB(P010, 2, 2, AR30Filter, 4, 4, 1, 10) -TESTBIPLANAR16TOB(P210, 2, 1, AR30Filter, 4, 4, 1, 10) +TESTBP16TOB(P010, 2, 2, AR30, 4, 4, 1, 10) +TESTBP16TOB(P210, 2, 1, AR30, 4, 4, 1, 10) +TESTBP16TOB(P012, 2, 2, AR30, 4, 4, 1, 12) +TESTBP16TOB(P212, 2, 1, AR30, 4, 4, 1, 12) +TESTBP16TOB(P016, 2, 2, AR30, 4, 4, 1, 16) +TESTBP16TOB(P216, 2, 1, AR30, 4, 4, 1, 16) +TESTBP16TOB(P010, 2, 2, AR30Filter, 4, 4, 1, 10) +TESTBP16TOB(P210, 2, 1, AR30Filter, 4, 4, 1, 10) #endif // LITTLE_ENDIAN_ONLY_TEST #endif // DISABLE_SLOW_TESTS diff --git a/files/unit_test/cpu_test.cc b/files/unit_test/cpu_test.cc index 080778f5..93867fa7 100644 --- a/files/unit_test/cpu_test.cc +++ b/files/unit_test/cpu_test.cc @@ -20,13 +20,23 @@ namespace libyuv { TEST_F(LibYUVBaseTest, TestCpuHas) { int cpu_flags = TestCpuFlag(-1); - printf("Cpu Flags %d\n", cpu_flags); + printf("Cpu Flags 0x%x\n", cpu_flags); #if defined(__arm__) || defined(__aarch64__) int has_arm = TestCpuFlag(kCpuHasARM); - printf("Has ARM %d\n", has_arm); + printf("Has ARM 0x%x\n", has_arm); int has_neon = TestCpuFlag(kCpuHasNEON); - printf("Has NEON %d\n", has_neon); + printf("Has NEON 0x%x\n", has_neon); #endif +#if defined(__riscv) && defined(__linux__) + int has_riscv = TestCpuFlag(kCpuHasRISCV); + printf("Has RISCV 0x%x\n", has_riscv); + int has_rvv = TestCpuFlag(kCpuHasRVV); + printf("Has RVV 0x%x\n", has_rvv); + int has_rvvzvfh = TestCpuFlag(kCpuHasRVVZVFH); + printf("Has RVVZVFH 0x%x\n", has_rvvzvfh); +#endif +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \ + defined(_M_X64) int has_x86 = TestCpuFlag(kCpuHasX86); int has_sse2 = 
TestCpuFlag(kCpuHasSSE2); int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); @@ -45,39 +55,38 @@ TEST_F(LibYUVBaseTest, TestCpuHas) { int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2); int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG); int has_avx512vpopcntdq = TestCpuFlag(kCpuHasAVX512VPOPCNTDQ); - printf("Has X86 %d\n", has_x86); - printf("Has SSE2 %d\n", has_sse2); - printf("Has SSSE3 %d\n", has_ssse3); - printf("Has SSE41 %d\n", has_sse41); - printf("Has SSE42 %d\n", has_sse42); - printf("Has AVX %d\n", has_avx); - printf("Has AVX2 %d\n", has_avx2); - printf("Has ERMS %d\n", has_erms); - printf("Has FMA3 %d\n", has_fma3); - printf("Has F16C %d\n", has_f16c); - printf("Has GFNI %d\n", has_gfni); - printf("Has AVX512BW %d\n", has_avx512bw); - printf("Has AVX512VL %d\n", has_avx512vl); - printf("Has AVX512VNNI %d\n", has_avx512vnni); - printf("Has AVX512VBMI %d\n", has_avx512vbmi); - printf("Has AVX512VBMI2 %d\n", has_avx512vbmi2); - printf("Has AVX512VBITALG %d\n", has_avx512vbitalg); - printf("Has AVX512VPOPCNTDQ %d\n", has_avx512vpopcntdq); - + printf("Has X86 0x%x\n", has_x86); + printf("Has SSE2 0x%x\n", has_sse2); + printf("Has SSSE3 0x%x\n", has_ssse3); + printf("Has SSE41 0x%x\n", has_sse41); + printf("Has SSE42 0x%x\n", has_sse42); + printf("Has AVX 0x%x\n", has_avx); + printf("Has AVX2 0x%x\n", has_avx2); + printf("Has ERMS 0x%x\n", has_erms); + printf("Has FMA3 0x%x\n", has_fma3); + printf("Has F16C 0x%x\n", has_f16c); + printf("Has GFNI 0x%x\n", has_gfni); + printf("Has AVX512BW 0x%x\n", has_avx512bw); + printf("Has AVX512VL 0x%x\n", has_avx512vl); + printf("Has AVX512VNNI 0x%x\n", has_avx512vnni); + printf("Has AVX512VBMI 0x%x\n", has_avx512vbmi); + printf("Has AVX512VBMI2 0x%x\n", has_avx512vbmi2); + printf("Has AVX512VBITALG 0x%x\n", has_avx512vbitalg); + printf("Has AVX512VPOPCNTDQ 0x%x\n", has_avx512vpopcntdq); +#endif #if defined(__mips__) int has_mips = TestCpuFlag(kCpuHasMIPS); - printf("Has MIPS %d\n", has_mips); + printf("Has MIPS 
0x%x\n", has_mips); int has_msa = TestCpuFlag(kCpuHasMSA); - printf("Has MSA %d\n", has_msa); + printf("Has MSA 0x%x\n", has_msa); #endif - #if defined(__loongarch__) int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH); - printf("Has LOONGARCH %d\n", has_loongarch); + printf("Has LOONGARCH 0x%x\n", has_loongarch); int has_lsx = TestCpuFlag(kCpuHasLSX); - printf("Has LSX %d\n", has_lsx); + printf("Has LSX 0x%x\n", has_lsx); int has_lasx = TestCpuFlag(kCpuHasLASX); - printf("Has LASX %d\n", has_lasx); + printf("Has LASX 0x%x\n", has_lasx); #endif } @@ -104,27 +113,33 @@ TEST_F(LibYUVBaseTest, TestCompilerMacros) { #ifdef __i386__ printf("__i386__ %d\n", __i386__); #endif -#ifdef __mips - printf("__mips %d\n", __mips); -#endif -#ifdef __mips_isa_rev - printf("__mips_isa_rev %d\n", __mips_isa_rev); -#endif #ifdef __x86_64__ printf("__x86_64__ %d\n", __x86_64__); #endif +#ifdef _M_IX86 + printf("_M_IX86 %d\n", _M_IX86); +#endif +#ifdef _M_X64 + printf("_M_X64 %d\n", _M_X64); +#endif #ifdef _MSC_VER printf("_MSC_VER %d\n", _MSC_VER); #endif #ifdef __aarch64__ printf("__aarch64__ %d\n", __aarch64__); #endif -#ifdef __APPLE__ - printf("__APPLE__ %d\n", __APPLE__); -#endif #ifdef __arm__ printf("__arm__ %d\n", __arm__); #endif +#ifdef __riscv + printf("__riscv %d\n", __riscv); +#endif +#ifdef __riscv_vector + printf("__riscv_vector %d\n", __riscv_vector); +#endif +#ifdef __APPLE__ + printf("__APPLE__ %d\n", __APPLE__); +#endif #ifdef __clang__ printf("__clang__ %d\n", __clang__); #endif @@ -140,20 +155,11 @@ TEST_F(LibYUVBaseTest, TestCompilerMacros) { #ifdef __mips_msa printf("__mips_msa %d\n", __mips_msa); #endif -#ifdef __native_client__ - printf("__native_client__ %d\n", __native_client__); -#endif -#ifdef __pic__ - printf("__pic__ %d\n", __pic__); -#endif -#ifdef __pnacl__ - printf("__pnacl__ %d\n", __pnacl__); -#endif -#ifdef _M_IX86 - printf("_M_IX86 %d\n", _M_IX86); +#ifdef __mips + printf("__mips %d\n", __mips); #endif -#ifdef _M_X64 - printf("_M_X64 %d\n", _M_X64); 
+#ifdef __mips_isa_rev + printf("__mips_isa_rev %d\n", __mips_isa_rev); #endif #ifdef _MIPS_ARCH_LOONGSON3A printf("_MIPS_ARCH_LOONGSON3A %d\n", _MIPS_ARCH_LOONGSON3A); #endif @@ -164,6 +170,15 @@ TEST_F(LibYUVBaseTest, TestCompilerMacros) { #ifdef _WIN32 printf("_WIN32 %d\n", _WIN32); #endif +#ifdef __native_client__ + printf("__native_client__ %d\n", __native_client__); +#endif +#ifdef __pic__ + printf("__pic__ %d\n", __pic__); +#endif +#ifdef __pnacl__ + printf("__pnacl__ %d\n", __pnacl__); +#endif #ifdef GG_LONGLONG printf("GG_LONGLONG %d\n", GG_LONGLONG); #endif @@ -200,8 +215,9 @@ TEST_F(LibYUVBaseTest, TestCpuId) { cpu_info[0] = cpu_info[1]; // Reorder output cpu_info[1] = cpu_info[3]; cpu_info[3] = 0; - printf("Cpu Vendor: %s %x %x %x\n", reinterpret_cast<char*>(&cpu_info[0]), - cpu_info[0], cpu_info[1], cpu_info[2]); + printf("Cpu Vendor: %s 0x%x 0x%x 0x%x\n", + reinterpret_cast<char*>(&cpu_info[0]), cpu_info[0], cpu_info[1], + cpu_info[2]); EXPECT_EQ(12u, strlen(reinterpret_cast<char*>(&cpu_info[0]))); // CPU Family and Model @@ -264,6 +280,32 @@ TEST_F(LibYUVBaseTest, TestLinuxMipsMsa) { } } +TEST_F(LibYUVBaseTest, TestLinuxRVV) { + if (FileExists("../../unit_test/testdata/riscv64.txt")) { + printf("Note: testing to load \"../../unit_test/testdata/riscv64.txt\"\n"); + + EXPECT_EQ(0, RiscvCpuCaps("../../unit_test/testdata/riscv64.txt")); + EXPECT_EQ(kCpuHasRVV, + RiscvCpuCaps("../../unit_test/testdata/riscv64_rvv.txt")); + EXPECT_EQ(kCpuHasRVV | kCpuHasRVVZVFH, + RiscvCpuCaps("../../unit_test/testdata/riscv64_rvv_zvfh.txt")); + } else { + printf( + "WARNING: unable to load " + "\"../../unit_test/testdata/riscv64.txt\"\n"); + } +#if defined(__linux__) && defined(__riscv) + if (FileExists("/proc/cpuinfo")) { + if (!(kCpuHasRVV & RiscvCpuCaps("/proc/cpuinfo"))) { + // This can happen on RVV emulator but /proc/cpuinfo is from host.
+ printf("WARNING: RVV build enabled but CPU does not have RVV\n"); + } + } else { + printf("WARNING: unable to load \"/proc/cpuinfo\"\n"); + } +#endif +} + // TODO(fbarchard): Fix clangcl test of cpuflags. #ifdef _MSC_VER TEST_F(LibYUVBaseTest, DISABLED_TestSetCpuFlags) { diff --git a/files/unit_test/planar_test.cc b/files/unit_test/planar_test.cc index 3a8c470b..ad97b87e 100644 --- a/files/unit_test/planar_test.cc +++ b/files/unit_test/planar_test.cc @@ -1638,29 +1638,29 @@ TEST_F(LibYUVPlanarTest, TestDetilePlane) { int i, j; // orig is tiled. Allocate enough memory for tiles. - int orig_width = (benchmark_width_ + 15) & ~15; - int orig_height = (benchmark_height_ + 15) & ~15; - int orig_plane_size = orig_width * orig_height; + int tile_width = (benchmark_width_ + 15) & ~15; + int tile_height = (benchmark_height_ + 15) & ~15; + int tile_plane_size = tile_width * tile_height; int y_plane_size = benchmark_width_ * benchmark_height_; - align_buffer_page_end(orig_y, orig_plane_size); + align_buffer_page_end(tile_y, tile_plane_size); align_buffer_page_end(dst_c, y_plane_size); align_buffer_page_end(dst_opt, y_plane_size); - MemRandomize(orig_y, orig_plane_size); + MemRandomize(tile_y, tile_plane_size); memset(dst_c, 0, y_plane_size); memset(dst_opt, 0, y_plane_size); // Disable all optimizations. MaskCpuFlags(disable_cpu_flags_); for (j = 0; j < benchmark_iterations_; j++) { - DetilePlane(orig_y, orig_width, dst_c, benchmark_width_, benchmark_width_, + DetilePlane(tile_y, tile_width, dst_c, benchmark_width_, benchmark_width_, benchmark_height_, 16); } // Enable optimizations. 
MaskCpuFlags(benchmark_cpu_info_); for (j = 0; j < benchmark_iterations_; j++) { - DetilePlane(orig_y, orig_width, dst_opt, benchmark_width_, benchmark_width_, + DetilePlane(tile_y, tile_width, dst_opt, benchmark_width_, benchmark_width_, benchmark_height_, 16); } @@ -1668,7 +1668,46 @@ TEST_F(LibYUVPlanarTest, TestDetilePlane) { EXPECT_EQ(dst_c[i], dst_opt[i]); } - free_aligned_buffer_page_end(orig_y); + free_aligned_buffer_page_end(tile_y); + free_aligned_buffer_page_end(dst_c); + free_aligned_buffer_page_end(dst_opt); +} + +TEST_F(LibYUVPlanarTest, TestDetilePlane_16) { + int i, j; + + // orig is tiled. Allocate enough memory for tiles. + int tile_width = (benchmark_width_ + 15) & ~15; + int tile_height = (benchmark_height_ + 15) & ~15; + int tile_plane_size = tile_width * tile_height * 2; + int y_plane_size = benchmark_width_ * benchmark_height_ * 2; + align_buffer_page_end(tile_y, tile_plane_size); + align_buffer_page_end(dst_c, y_plane_size); + align_buffer_page_end(dst_opt, y_plane_size); + + MemRandomize(tile_y, tile_plane_size); + memset(dst_c, 0, y_plane_size); + memset(dst_opt, 0, y_plane_size); + + // Disable all optimizations. + MaskCpuFlags(disable_cpu_flags_); + for (j = 0; j < benchmark_iterations_; j++) { + DetilePlane_16((const uint16_t*)tile_y, tile_width, (uint16_t*)dst_c, + benchmark_width_, benchmark_width_, benchmark_height_, 16); + } + + // Enable optimizations. + MaskCpuFlags(benchmark_cpu_info_); + for (j = 0; j < benchmark_iterations_; j++) { + DetilePlane_16((const uint16_t*)tile_y, tile_width, (uint16_t*)dst_opt, + benchmark_width_, benchmark_width_, benchmark_height_, 16); + } + + for (i = 0; i < y_plane_size; ++i) { + EXPECT_EQ(dst_c[i], dst_opt[i]); + } + + free_aligned_buffer_page_end(tile_y); free_aligned_buffer_page_end(dst_c); free_aligned_buffer_page_end(dst_opt); } @@ -1678,33 +1717,33 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Correctness) { int i, j; // orig is tiled. Allocate enough memory for tiles. 
- int orig_width = (benchmark_width_ + 15) & ~15; - int orig_height = (benchmark_height_ + 15) & ~15; - int orig_plane_size = orig_width * orig_height; + int tile_width = (benchmark_width_ + 15) & ~15; + int tile_height = (benchmark_height_ + 15) & ~15; + int tile_plane_size = tile_width * tile_height; int uv_plane_size = ((benchmark_width_ + 1) / 2) * benchmark_height_; - align_buffer_page_end(orig_uv, orig_plane_size); - align_buffer_page_end(detiled_uv, orig_plane_size); + align_buffer_page_end(tile_uv, tile_plane_size); + align_buffer_page_end(detiled_uv, tile_plane_size); align_buffer_page_end(dst_u_two_stage, uv_plane_size); align_buffer_page_end(dst_u_opt, uv_plane_size); align_buffer_page_end(dst_v_two_stage, uv_plane_size); align_buffer_page_end(dst_v_opt, uv_plane_size); - MemRandomize(orig_uv, orig_plane_size); - memset(detiled_uv, 0, orig_plane_size); + MemRandomize(tile_uv, tile_plane_size); + memset(detiled_uv, 0, tile_plane_size); memset(dst_u_two_stage, 0, uv_plane_size); memset(dst_u_opt, 0, uv_plane_size); memset(dst_v_two_stage, 0, uv_plane_size); memset(dst_v_opt, 0, uv_plane_size); - DetileSplitUVPlane(orig_uv, orig_width, dst_u_opt, (benchmark_width_ + 1) / 2, + DetileSplitUVPlane(tile_uv, tile_width, dst_u_opt, (benchmark_width_ + 1) / 2, dst_v_opt, (benchmark_width_ + 1) / 2, benchmark_width_, benchmark_height_, 16); // Benchmark 2 step conversion for comparison. 
for (j = 0; j < benchmark_iterations_; j++) { - DetilePlane(orig_uv, orig_width, detiled_uv, benchmark_width_, + DetilePlane(tile_uv, tile_width, detiled_uv, benchmark_width_, benchmark_width_, benchmark_height_, 16); - SplitUVPlane(detiled_uv, orig_width, dst_u_two_stage, + SplitUVPlane(detiled_uv, tile_width, dst_u_two_stage, (benchmark_width_ + 1) / 2, dst_v_two_stage, (benchmark_width_ + 1) / 2, (benchmark_width_ + 1) / 2, benchmark_height_); @@ -1715,7 +1754,7 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Correctness) { EXPECT_EQ(dst_v_two_stage[i], dst_v_opt[i]); } - free_aligned_buffer_page_end(orig_uv); + free_aligned_buffer_page_end(tile_uv); free_aligned_buffer_page_end(detiled_uv); free_aligned_buffer_page_end(dst_u_two_stage); free_aligned_buffer_page_end(dst_u_opt); @@ -1727,17 +1766,17 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) { int i, j; // orig is tiled. Allocate enough memory for tiles. - int orig_width = (benchmark_width_ + 15) & ~15; - int orig_height = (benchmark_height_ + 15) & ~15; - int orig_plane_size = orig_width * orig_height; + int tile_width = (benchmark_width_ + 15) & ~15; + int tile_height = (benchmark_height_ + 15) & ~15; + int tile_plane_size = tile_width * tile_height; int uv_plane_size = ((benchmark_width_ + 1) / 2) * benchmark_height_; - align_buffer_page_end(orig_uv, orig_plane_size); + align_buffer_page_end(tile_uv, tile_plane_size); align_buffer_page_end(dst_u_c, uv_plane_size); align_buffer_page_end(dst_u_opt, uv_plane_size); align_buffer_page_end(dst_v_c, uv_plane_size); align_buffer_page_end(dst_v_opt, uv_plane_size); - MemRandomize(orig_uv, orig_plane_size); + MemRandomize(tile_uv, tile_plane_size); memset(dst_u_c, 0, uv_plane_size); memset(dst_u_opt, 0, uv_plane_size); memset(dst_v_c, 0, uv_plane_size); @@ -1746,7 +1785,7 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) { // Disable all optimizations. 
MaskCpuFlags(disable_cpu_flags_); - DetileSplitUVPlane(orig_uv, orig_width, dst_u_c, (benchmark_width_ + 1) / 2, + DetileSplitUVPlane(tile_uv, tile_width, dst_u_c, (benchmark_width_ + 1) / 2, dst_v_c, (benchmark_width_ + 1) / 2, benchmark_width_, benchmark_height_, 16); @@ -1755,7 +1794,7 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) { for (j = 0; j < benchmark_iterations_; j++) { DetileSplitUVPlane( - orig_uv, orig_width, dst_u_opt, (benchmark_width_ + 1) / 2, dst_v_opt, + tile_uv, tile_width, dst_u_opt, (benchmark_width_ + 1) / 2, dst_v_opt, (benchmark_width_ + 1) / 2, benchmark_width_, benchmark_height_, 16); } @@ -1764,7 +1803,7 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) { EXPECT_EQ(dst_v_c[i], dst_v_opt[i]); } - free_aligned_buffer_page_end(orig_uv); + free_aligned_buffer_page_end(tile_uv); free_aligned_buffer_page_end(dst_u_c); free_aligned_buffer_page_end(dst_u_opt); free_aligned_buffer_page_end(dst_v_c); @@ -3495,8 +3534,8 @@ TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 16) // TODO(fbarchard): improve test for platforms and cpu detect #ifdef HAS_MERGEUVROW_16_AVX2 TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + // Round count up to multiple of 8 + const int kPixels = (benchmark_width_ * benchmark_height_ + 7) & ~7; align_buffer_page_end(src_pixels_u, kPixels * 2); align_buffer_page_end(src_pixels_v, kPixels * 2); diff --git a/files/unit_test/rotate_argb_test.cc b/files/unit_test/rotate_argb_test.cc index 01ed69ca..74952c4e 100644 --- a/files/unit_test/rotate_argb_test.cc +++ b/files/unit_test/rotate_argb_test.cc @@ -225,4 +225,110 @@ TEST_F(LibYUVRotateTest, RotatePlane90_TestStride) { free_aligned_buffer_page_end(src_argb); } +static void TestRotatePlane_16(int src_width, + int src_height, + int dst_width, + int dst_height, + libyuv::RotationMode mode, + int benchmark_iterations, + int disable_cpu_flags, + int 
benchmark_cpu_info) { + if (src_width < 1) { + src_width = 1; + } + if (src_height < 1) { + src_height = 1; + } + if (dst_width < 1) { + dst_width = 1; + } + if (dst_height < 1) { + dst_height = 1; + } + int src_stride = src_width; + int src_plane_size = src_stride * abs(src_height); + align_buffer_page_end_16(src, src_plane_size); + for (int i = 0; i < src_plane_size; ++i) { + src[i] = fastrand() & 0xff; + } + + int dst_stride = dst_width; + int dst_plane_size = dst_stride * dst_height; + align_buffer_page_end_16(dst_c, dst_plane_size); + align_buffer_page_end_16(dst_opt, dst_plane_size); + memset(dst_c, 2, dst_plane_size); + memset(dst_opt, 3, dst_plane_size); + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. + RotatePlane_16(src, src_stride, dst_c, dst_stride, src_width, src_height, + mode); + + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. + for (int i = 0; i < benchmark_iterations; ++i) { + RotatePlane_16(src, src_stride, dst_opt, dst_stride, src_width, src_height, + mode); + } + + // Rotation should be exact. 
+ for (int i = 0; i < dst_plane_size; ++i) { + EXPECT_EQ(dst_c[i], dst_opt[i]); + } + + free_aligned_buffer_page_end_16(dst_c); + free_aligned_buffer_page_end_16(dst_opt); + free_aligned_buffer_page_end_16(src); +} + +TEST_F(LibYUVRotateTest, RotatePlane0_16_Opt) { + TestRotatePlane_16(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate0, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, RotatePlane90_16_Opt) { + TestRotatePlane_16(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate90, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, RotatePlane180_16_Opt) { + TestRotatePlane_16(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate180, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, RotatePlane270_16_Opt) { + TestRotatePlane_16(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate270, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, RotatePlane0_16_Odd) { + TestRotatePlane_16(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_width_ + 1, benchmark_height_ + 1, kRotate0, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, RotatePlane90_16_Odd) { + TestRotatePlane_16(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_height_ + 1, benchmark_width_ + 1, kRotate90, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, RotatePlane180_16_Odd) { + TestRotatePlane_16(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_width_ + 1, benchmark_height_ + 1, kRotate180, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, RotatePlane270_16_Odd) { + TestRotatePlane_16(benchmark_width_ + 1, benchmark_height_ + 1, 
+ benchmark_height_ + 1, benchmark_width_ + 1, kRotate270, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); +} + } // namespace libyuv diff --git a/files/unit_test/rotate_test.cc b/files/unit_test/rotate_test.cc index d3887414..abc08efa 100644 --- a/files/unit_test/rotate_test.cc +++ b/files/unit_test/rotate_test.cc @@ -14,6 +14,10 @@ #include "libyuv/cpu_id.h" #include "libyuv/rotate.h" +#ifdef ENABLE_ROW_TESTS +#include "libyuv/rotate_row.h" +#endif + namespace libyuv { #define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a)) @@ -596,4 +600,363 @@ TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2) #undef TESTAPLANARTOP #undef TESTAPLANARTOPI +static void I010TestRotate(int src_width, + int src_height, + int dst_width, + int dst_height, + libyuv::RotationMode mode, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { + if (src_width < 1) { + src_width = 1; + } + if (src_height == 0) { + src_height = 1; + } + if (dst_width < 1) { + dst_width = 1; + } + if (dst_height < 1) { + dst_height = 1; + } + int src_i010_y_size = src_width * Abs(src_height); + int src_i010_uv_size = ((src_width + 1) / 2) * ((Abs(src_height) + 1) / 2); + int src_i010_size = src_i010_y_size + src_i010_uv_size * 2; + align_buffer_page_end_16(src_i010, src_i010_size); + for (int i = 0; i < src_i010_size; ++i) { + src_i010[i] = fastrand() & 0x3ff; + } + + int dst_i010_y_size = dst_width * dst_height; + int dst_i010_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2); + int dst_i010_size = dst_i010_y_size + dst_i010_uv_size * 2; + align_buffer_page_end_16(dst_i010_c, dst_i010_size); + align_buffer_page_end_16(dst_i010_opt, dst_i010_size); + memset(dst_i010_c, 2, dst_i010_size * 2); + memset(dst_i010_opt, 3, dst_i010_size * 2); + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. 
+ I010Rotate(src_i010, src_width, src_i010 + src_i010_y_size, + (src_width + 1) / 2, src_i010 + src_i010_y_size + src_i010_uv_size, + (src_width + 1) / 2, dst_i010_c, dst_width, + dst_i010_c + dst_i010_y_size, (dst_width + 1) / 2, + dst_i010_c + dst_i010_y_size + dst_i010_uv_size, + (dst_width + 1) / 2, src_width, src_height, mode); + + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. + for (int i = 0; i < benchmark_iterations; ++i) { + I010Rotate( + src_i010, src_width, src_i010 + src_i010_y_size, (src_width + 1) / 2, + src_i010 + src_i010_y_size + src_i010_uv_size, (src_width + 1) / 2, + dst_i010_opt, dst_width, dst_i010_opt + dst_i010_y_size, + (dst_width + 1) / 2, dst_i010_opt + dst_i010_y_size + dst_i010_uv_size, + (dst_width + 1) / 2, src_width, src_height, mode); + } + + // Rotation should be exact. + for (int i = 0; i < dst_i010_size; ++i) { + EXPECT_EQ(dst_i010_c[i], dst_i010_opt[i]); + } + + free_aligned_buffer_page_end_16(dst_i010_c); + free_aligned_buffer_page_end_16(dst_i010_opt); + free_aligned_buffer_page_end_16(src_i010); +} + +TEST_F(LibYUVRotateTest, I010Rotate0_Opt) { + I010TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate0, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I010Rotate90_Opt) { + I010TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate90, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I010Rotate180_Opt) { + I010TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate180, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I010Rotate270_Opt) { + I010TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate270, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +static void I210TestRotate(int 
src_width, + int src_height, + int dst_width, + int dst_height, + libyuv::RotationMode mode, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { + if (src_width < 1) { + src_width = 1; + } + if (src_height == 0) { + src_height = 1; + } + if (dst_width < 1) { + dst_width = 1; + } + if (dst_height < 1) { + dst_height = 1; + } + int src_i210_y_size = src_width * Abs(src_height); + int src_i210_uv_size = ((src_width + 1) / 2) * Abs(src_height); + int src_i210_size = src_i210_y_size + src_i210_uv_size * 2; + align_buffer_page_end_16(src_i210, src_i210_size); + for (int i = 0; i < src_i210_size; ++i) { + src_i210[i] = fastrand() & 0x3ff; + } + + int dst_i210_y_size = dst_width * dst_height; + int dst_i210_uv_size = ((dst_width + 1) / 2) * dst_height; + int dst_i210_size = dst_i210_y_size + dst_i210_uv_size * 2; + align_buffer_page_end_16(dst_i210_c, dst_i210_size); + align_buffer_page_end_16(dst_i210_opt, dst_i210_size); + memset(dst_i210_c, 2, dst_i210_size * 2); + memset(dst_i210_opt, 3, dst_i210_size * 2); + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. + I210Rotate(src_i210, src_width, src_i210 + src_i210_y_size, + (src_width + 1) / 2, src_i210 + src_i210_y_size + src_i210_uv_size, + (src_width + 1) / 2, dst_i210_c, dst_width, + dst_i210_c + dst_i210_y_size, (dst_width + 1) / 2, + dst_i210_c + dst_i210_y_size + dst_i210_uv_size, + (dst_width + 1) / 2, src_width, src_height, mode); + + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. + for (int i = 0; i < benchmark_iterations; ++i) { + I210Rotate( + src_i210, src_width, src_i210 + src_i210_y_size, (src_width + 1) / 2, + src_i210 + src_i210_y_size + src_i210_uv_size, (src_width + 1) / 2, + dst_i210_opt, dst_width, dst_i210_opt + dst_i210_y_size, + (dst_width + 1) / 2, dst_i210_opt + dst_i210_y_size + dst_i210_uv_size, + (dst_width + 1) / 2, src_width, src_height, mode); + } + + // Rotation should be exact. 
+ for (int i = 0; i < dst_i210_size; ++i) { + EXPECT_EQ(dst_i210_c[i], dst_i210_opt[i]); + } + + free_aligned_buffer_page_end_16(dst_i210_c); + free_aligned_buffer_page_end_16(dst_i210_opt); + free_aligned_buffer_page_end_16(src_i210); +} + +TEST_F(LibYUVRotateTest, I210Rotate0_Opt) { + I210TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate0, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I210Rotate90_Opt) { + I210TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate90, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I210Rotate180_Opt) { + I210TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate180, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I210Rotate270_Opt) { + I210TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate270, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +static void I410TestRotate(int src_width, + int src_height, + int dst_width, + int dst_height, + libyuv::RotationMode mode, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { + if (src_width < 1) { + src_width = 1; + } + if (src_height == 0) { + src_height = 1; + } + if (dst_width < 1) { + dst_width = 1; + } + if (dst_height < 1) { + dst_height = 1; + } + int src_i410_y_size = src_width * Abs(src_height); + int src_i410_uv_size = src_width * Abs(src_height); + int src_i410_size = src_i410_y_size + src_i410_uv_size * 2; + align_buffer_page_end_16(src_i410, src_i410_size); + for (int i = 0; i < src_i410_size; ++i) { + src_i410[i] = fastrand() & 0x3ff; + } + + int dst_i410_y_size = dst_width * dst_height; + int dst_i410_uv_size = dst_width * dst_height; + int dst_i410_size = dst_i410_y_size + dst_i410_uv_size * 2; + 
align_buffer_page_end_16(dst_i410_c, dst_i410_size); + align_buffer_page_end_16(dst_i410_opt, dst_i410_size); + memset(dst_i410_c, 2, dst_i410_size * 2); + memset(dst_i410_opt, 3, dst_i410_size * 2); + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. + I410Rotate(src_i410, src_width, src_i410 + src_i410_y_size, src_width, + src_i410 + src_i410_y_size + src_i410_uv_size, src_width, + dst_i410_c, dst_width, dst_i410_c + dst_i410_y_size, dst_width, + dst_i410_c + dst_i410_y_size + dst_i410_uv_size, dst_width, + src_width, src_height, mode); + + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. + for (int i = 0; i < benchmark_iterations; ++i) { + I410Rotate(src_i410, src_width, src_i410 + src_i410_y_size, src_width, + src_i410 + src_i410_y_size + src_i410_uv_size, src_width, + dst_i410_opt, dst_width, dst_i410_opt + dst_i410_y_size, + dst_width, dst_i410_opt + dst_i410_y_size + dst_i410_uv_size, + dst_width, src_width, src_height, mode); + } + + // Rotation should be exact. 
+ for (int i = 0; i < dst_i410_size; ++i) { + EXPECT_EQ(dst_i410_c[i], dst_i410_opt[i]); + } + + free_aligned_buffer_page_end_16(dst_i410_c); + free_aligned_buffer_page_end_16(dst_i410_opt); + free_aligned_buffer_page_end_16(src_i410); +} + +TEST_F(LibYUVRotateTest, I410Rotate0_Opt) { + I410TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate0, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I410Rotate90_Opt) { + I410TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate90, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I410Rotate180_Opt) { + I410TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate180, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I410Rotate270_Opt) { + I410TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate270, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +#if defined(ENABLE_ROW_TESTS) + +TEST_F(LibYUVRotateTest, Transpose4x4_Test) { + // dst width and height + const int width = 4; + const int height = 4; + int src_pixels[4][4]; + int dst_pixels_c[4][4]; + int dst_pixels_opt[4][4]; + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + src_pixels[i][j] = i * 10 + j; + } + } + memset(dst_pixels_c, 1, width * height * 4); + memset(dst_pixels_opt, 2, width * height * 4); + + Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_c, width * 4, width); + + const int benchmark_iterations = + (benchmark_iterations_ * benchmark_width_ * benchmark_height_ + 15) / + (4 * 4); + for (int i = 0; i < benchmark_iterations; ++i) { +#if defined(HAS_TRANSPOSE4X4_32_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4, + 
(uint8_t*)dst_pixels_opt, width * 4, width); + } else +#elif defined(HAS_TRANSPOSE4X4_32_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_opt, width * 4, width); + } else +#endif + { + Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_opt, width * 4, width); + } + } + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + EXPECT_EQ(dst_pixels_c[i][j], src_pixels[j][i]); + EXPECT_EQ(dst_pixels_c[i][j], dst_pixels_opt[i][j]); + } + } +} + +TEST_F(LibYUVRotateTest, Transpose4x4_Opt) { + // dst width and height + const int width = ((benchmark_width_ * benchmark_height_ + 3) / 4 + 3) & ~3; + const int height = 4; + align_buffer_page_end(src_pixels, height * width * 4); + align_buffer_page_end(dst_pixels_c, width * height * 4); + align_buffer_page_end(dst_pixels_opt, width * height * 4); + + MemRandomize(src_pixels, height * width * 4); + memset(dst_pixels_c, 1, width * height * 4); + memset(dst_pixels_opt, 2, width * height * 4); + + Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_c, width * 4, width); + + for (int i = 0; i < benchmark_iterations_; ++i) { +#if defined(HAS_TRANSPOSE4X4_32_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_opt, width * 4, width); + } else +#elif defined(HAS_TRANSPOSE4X4_32_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Transpose4x4_32_AVX2((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_opt, width * 4, width); + } else if (TestCpuFlag(kCpuHasSSE2)) { + Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_opt, width * 4, width); + } else +#endif + { + Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4, + (uint8_t*)dst_pixels_opt, width * 4, width); + } + } + + for (int i = 0; i < width * height; ++i) { + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + } + + 
free_aligned_buffer_page_end(src_pixels); + free_aligned_buffer_page_end(dst_pixels_c); + free_aligned_buffer_page_end(dst_pixels_opt); +} + +#endif // ENABLE_ROW_TESTS + } // namespace libyuv diff --git a/files/unit_test/scale_uv_test.cc b/files/unit_test/scale_uv_test.cc index 3d524bef..dab217c9 100644 --- a/files/unit_test/scale_uv_test.cc +++ b/files/unit_test/scale_uv_test.cc @@ -39,55 +39,35 @@ static int UVTestFilter(int src_width, return 0; } - int i, j; - const int b = 0; // 128 to test for padding/stride. - int64_t src_uv_plane_size = - (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2) * 2LL; - int src_stride_uv = (b * 2 + Abs(src_width)) * 2; + int i; + int64_t src_uv_plane_size = Abs(src_width) * Abs(src_height) * 2LL; + int src_stride_uv = Abs(src_width) * 2; + int64_t dst_uv_plane_size = dst_width * dst_height * 2LL; + int dst_stride_uv = dst_width * 2; align_buffer_page_end(src_uv, src_uv_plane_size); - if (!src_uv) { - printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); - return 0; - } - MemRandomize(src_uv, src_uv_plane_size); - - int64_t dst_uv_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 2LL; - int dst_stride_uv = (b * 2 + dst_width) * 2; - align_buffer_page_end(dst_uv_c, dst_uv_plane_size); align_buffer_page_end(dst_uv_opt, dst_uv_plane_size); - if (!dst_uv_c || !dst_uv_opt) { + + if (!src_uv || !dst_uv_c || !dst_uv_opt) { printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } + MemRandomize(src_uv, src_uv_plane_size); memset(dst_uv_c, 2, dst_uv_plane_size); - memset(dst_uv_opt, 3, dst_uv_plane_size); - - // Warm up both versions for consistent benchmarks. - MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. - UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, - src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, dst_stride_uv, - dst_width, dst_height, f); - MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. 
- UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, - src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv, - dst_width, dst_height, f); + memset(dst_uv_opt, 123, dst_uv_plane_size); MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. double c_time = get_time(); - UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, - src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, dst_stride_uv, + UVScale(src_uv, src_stride_uv, src_width, src_height, dst_uv_c, dst_stride_uv, dst_width, dst_height, f); - c_time = (get_time() - c_time); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. double opt_time = get_time(); for (i = 0; i < benchmark_iterations; ++i) { - UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, - src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv, - dst_width, dst_height, f); + UVScale(src_uv, src_stride_uv, src_width, src_height, dst_uv_opt, + dst_stride_uv, dst_width, dst_height, f); } opt_time = (get_time() - opt_time) / benchmark_iterations; @@ -95,18 +75,11 @@ static int UVTestFilter(int src_width, printf("filter %d - %8d us C - %8d us OPT\n", f, static_cast(c_time * 1e6), static_cast(opt_time * 1e6)); - // C version may be a little off from the optimized. Order of - // operations may introduce rounding somewhere. So do a difference - // of the buffers and look to see that the max difference isn't - // over 2. 
int max_diff = 0; - for (i = b; i < (dst_height + b); ++i) { - for (j = b * 2; j < (dst_width + b) * 2; ++j) { - int abs_diff = Abs(dst_uv_c[(i * dst_stride_uv) + j] - - dst_uv_opt[(i * dst_stride_uv) + j]); - if (abs_diff > max_diff) { - max_diff = abs_diff; - } + for (i = 0; i < dst_uv_plane_size; ++i) { + int abs_diff = Abs(dst_uv_c[i] - dst_uv_opt[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; } } @@ -121,28 +94,26 @@ static int UVTestFilter(int src_width, #define DX(x, nom, denom) static_cast((Abs(x) / nom) * nom) #define SX(x, nom, denom) static_cast((x / nom) * denom) -#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \ +#define TEST_FACTOR1(name, filter, nom, denom) \ TEST_F(LibYUVScaleTest, UVScaleDownBy##name##_##filter) { \ int diff = UVTestFilter( \ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ benchmark_cpu_info_); \ - EXPECT_LE(diff, max_diff); \ + EXPECT_EQ(0, diff); \ } #if defined(ENABLE_FULL_TESTS) -// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but -// filtering is different fixed point implementations for SSSE3, Neon and C. -#define TEST_FACTOR(name, nom, denom) \ - TEST_FACTOR1(name, None, nom, denom, 0) \ - TEST_FACTOR1(name, Linear, nom, denom, 3) \ - TEST_FACTOR1(name, Bilinear, nom, denom, 3) \ - TEST_FACTOR1(name, Box, nom, denom, 3) +// Test a scale factor with all 4 filters. Expect exact for SIMD vs C. +#define TEST_FACTOR(name, nom, denom) \ + TEST_FACTOR1(name, None, nom, denom) \ + TEST_FACTOR1(name, Linear, nom, denom) \ + TEST_FACTOR1(name, Bilinear, nom, denom) \ + TEST_FACTOR1(name, Box, nom, denom) #else // Test a scale factor with Bilinear. 
-#define TEST_FACTOR(name, nom, denom) \ - TEST_FACTOR1(name, Bilinear, nom, denom, 3) +#define TEST_FACTOR(name, nom, denom) TEST_FACTOR1(name, Bilinear, nom, denom) #endif TEST_FACTOR(2, 1, 2) diff --git a/files/unit_test/testdata/riscv64.txt b/files/unit_test/testdata/riscv64.txt new file mode 100644 index 00000000..fbb4200f --- /dev/null +++ b/files/unit_test/testdata/riscv64.txt @@ -0,0 +1,4 @@ +processor : 0 +hart : 1 +isa : rv64imac +mmu : sv48 \ No newline at end of file diff --git a/files/unit_test/testdata/riscv64_rvv.txt b/files/unit_test/testdata/riscv64_rvv.txt new file mode 100644 index 00000000..af1b3f36 --- /dev/null +++ b/files/unit_test/testdata/riscv64_rvv.txt @@ -0,0 +1,4 @@ +processor : 0 +hart : 1 +isa : rv64imafdcv +mmu : sv48 \ No newline at end of file diff --git a/files/unit_test/testdata/riscv64_rvv_zvfh.txt b/files/unit_test/testdata/riscv64_rvv_zvfh.txt new file mode 100644 index 00000000..c416c1af --- /dev/null +++ b/files/unit_test/testdata/riscv64_rvv_zvfh.txt @@ -0,0 +1,4 @@ +processor : 0 +hart : 1 +isa : rv64imafdcv_zfh_zvfh +mmu : sv48 \ No newline at end of file diff --git a/files/unit_test/unit_test.cc b/files/unit_test/unit_test.cc index 61145a46..b66ebfab 100644 --- a/files/unit_test/unit_test.cc +++ b/files/unit_test/unit_test.cc @@ -88,6 +88,11 @@ int TestCpuEnv(int cpu_info) { cpu_info &= ~libyuv::kCpuHasLASX; } #endif +#if defined(__riscv) && defined(__linux__) + if (TestEnv("LIBYUV_DISABLE_RVV")) { + cpu_info &= ~libyuv::kCpuHasRVV; + } +#endif #if !defined(__pnacl__) && !defined(__CLR_VER) && \ (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ defined(_M_IX86)) diff --git a/files/unit_test/unit_test.h b/files/unit_test/unit_test.h index 0a8df4d2..99cc8d19 100644 --- a/files/unit_test/unit_test.h +++ b/files/unit_test/unit_test.h @@ -11,10 +11,10 @@ #ifndef UNIT_TEST_UNIT_TEST_H_ // NOLINT #define UNIT_TEST_UNIT_TEST_H_ +#include // For NULL #ifdef _WIN32 #include #else -#include #include #endif @@ -77,7 
+77,18 @@ static inline bool SizeValid(int src_width, #define free_aligned_buffer_page_end(var) \ free(var##_mem); \ - var = 0 + var = NULL + +#define align_buffer_page_end_16(var, size) \ + uint8_t* var##_mem = \ + reinterpret_cast(malloc(((size)*2 + 4095 + 63) & ~4095)); \ + uint16_t* var = reinterpret_cast( \ + (intptr_t)(var##_mem + (((size)*2 + 4095 + 63) & ~4095) - (size)*2) & \ + ~63) + +#define free_aligned_buffer_page_end_16(var) \ + free(var##_mem); \ + var = NULL #ifdef WIN32 static inline double get_time() { diff --git a/files/util/cpuid.c b/files/util/cpuid.c index b618bb10..edc6a26e 100644 --- a/files/util/cpuid.c +++ b/files/util/cpuid.c @@ -21,8 +21,9 @@ using namespace libyuv; int main(int argc, const char* argv[]) { int cpu_flags = TestCpuFlag(-1); int has_arm = TestCpuFlag(kCpuHasARM); - int has_mips = TestCpuFlag(kCpuHasMIPS); + int has_riscv = TestCpuFlag(kCpuHasRISCV); int has_x86 = TestCpuFlag(kCpuHasX86); + int has_mips = TestCpuFlag(kCpuHasMIPS); int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH); (void)argc; (void)argv; @@ -62,24 +63,28 @@ int main(int argc, const char* argv[]) { model, model); } #endif - printf("Cpu Flags %x\n", cpu_flags); - printf("Has ARM %x\n", has_arm); - printf("Has MIPS %x\n", has_mips); - printf("Has X86 %x\n", has_x86); - printf("Has LOONGARCH %x\n", has_loongarch); + printf("Cpu Flags 0x%x\n", cpu_flags); if (has_arm) { int has_neon = TestCpuFlag(kCpuHasNEON); - printf("Has NEON %x\n", has_neon); + printf("Has ARM 0x%x\n", has_arm); + printf("Has NEON 0x%x\n", has_neon); + } + if (has_riscv) { + int has_rvv = TestCpuFlag(kCpuHasRVV); + printf("Has RISCV 0x%x\n", has_riscv); + printf("Has RVV 0x%x\n", has_rvv); } if (has_mips) { int has_msa = TestCpuFlag(kCpuHasMSA); - printf("Has MSA %x\n", has_msa); + printf("Has MIPS 0x%x\n", has_mips); + printf("Has MSA 0x%x\n", has_msa); } if (has_loongarch) { int has_lsx = TestCpuFlag(kCpuHasLSX); - printf("Has LSX %x\n", has_lsx); int has_lasx = TestCpuFlag(kCpuHasLASX); - 
printf("Has LASX %x\n", has_lasx); + printf("Has LOONGARCH 0x%x\n", has_loongarch); + printf("Has LSX 0x%x\n", has_lsx); + printf("Has LASX 0x%x\n", has_lasx); } if (has_x86) { int has_sse2 = TestCpuFlag(kCpuHasSSE2); @@ -99,23 +104,24 @@ int main(int argc, const char* argv[]) { int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2); int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG); int has_avx512vpopcntdq = TestCpuFlag(kCpuHasAVX512VPOPCNTDQ); - printf("Has SSE2 %x\n", has_sse2); - printf("Has SSSE3 %x\n", has_ssse3); - printf("Has SSE4.1 %x\n", has_sse41); - printf("Has SSE4.2 %x\n", has_sse42); - printf("Has AVX %x\n", has_avx); - printf("Has AVX2 %x\n", has_avx2); - printf("Has ERMS %x\n", has_erms); - printf("Has FMA3 %x\n", has_fma3); - printf("Has F16C %x\n", has_f16c); - printf("Has GFNI %x\n", has_gfni); - printf("Has AVX512BW %x\n", has_avx512bw); - printf("Has AVX512VL %x\n", has_avx512vl); - printf("Has AVX512VNNI %x\n", has_avx512vnni); - printf("Has AVX512VBMI %x\n", has_avx512vbmi); - printf("Has AVX512VBMI2 %x\n", has_avx512vbmi2); - printf("Has AVX512VBITALG %x\n", has_avx512vbitalg); - printf("Has AVX512VPOPCNTDQ %x\n", has_avx512vpopcntdq); + printf("Has X86 0x%x\n", has_x86); + printf("Has SSE2 0x%x\n", has_sse2); + printf("Has SSSE3 0x%x\n", has_ssse3); + printf("Has SSE4.1 0x%x\n", has_sse41); + printf("Has SSE4.2 0x%x\n", has_sse42); + printf("Has AVX 0x%x\n", has_avx); + printf("Has AVX2 0x%x\n", has_avx2); + printf("Has ERMS 0x%x\n", has_erms); + printf("Has FMA3 0x%x\n", has_fma3); + printf("Has F16C 0x%x\n", has_f16c); + printf("Has GFNI 0x%x\n", has_gfni); + printf("Has AVX512BW 0x%x\n", has_avx512bw); + printf("Has AVX512VL 0x%x\n", has_avx512vl); + printf("Has AVX512VNNI 0x%x\n", has_avx512vnni); + printf("Has AVX512VBMI 0x%x\n", has_avx512vbmi); + printf("Has AVX512VBMI2 0x%x\n", has_avx512vbmi2); + printf("Has AVX512VBITALG 0x%x\n", has_avx512vbitalg); + printf("Has AVX512VPOPCNTDQ 0x%x\n", has_avx512vpopcntdq); } return 0; } 
diff --git a/files/util/yuvconstants.c b/files/util/yuvconstants.c index 037e0824..4e5185af 100644 --- a/files/util/yuvconstants.c +++ b/files/util/yuvconstants.c @@ -43,9 +43,10 @@ // #define BR (-VR * 128 + YB) int main(int argc, const char* argv[]) { - if (argc < 2) { - printf("yuvconstants Kr Kb\n"); - printf(" MC BT KR = 0.2126; KB = 0.0722\n"); + if (argc < 3) { + printf("yuvconstants [KR] [KB]\n"); + printf(" e.g. yuvconstants 0.2126 0.0722\n"); + printf(" MC BT KR KB\n"); printf(" 1 BT.709 KR = 0.2126; KB = 0.0722\n"); printf(" 4 FCC KR = 0.30; KB = 0.11\n"); printf(" 6 BT.601 KR = 0.299; KB = 0.114\n"); @@ -53,8 +54,8 @@ int main(int argc, const char* argv[]) { printf(" 9 BT.2020 KR = 0.2627; KB = 0.0593\n"); return -1; } - float kr = atof(argv[1]); - float kb = atof(argv[2]); + float kr = (float)atof(argv[1]); + float kb = (float)atof(argv[2]); float kg = 1 - kr - kb; float vr = 2 * (1 - kr); diff --git a/files/util/yuvconvert.cc b/files/util/yuvconvert.cc index 332699e3..93b52668 100644 --- a/files/util/yuvconvert.cc +++ b/files/util/yuvconvert.cc @@ -42,9 +42,9 @@ static __inline uint32_t Abs(int32_t v) { } // Parse PYUV format. ie name.1920x800_24Hz_P420.yuv -bool ExtractResolutionFromFilename(const char* name, - int* width_ptr, - int* height_ptr) { +static bool ExtractResolutionFromFilename(const char* name, + int* width_ptr, + int* height_ptr) { // Isolate the .width_height. section of the filename by searching for a // dot or underscore followed by a digit. for (int i = 0; name[i]; ++i) { @@ -59,7 +59,7 @@ bool ExtractResolutionFromFilename(const char* name, return false; } -void PrintHelp(const char* program) { +static void PrintHelp(const char* program) { printf("%s [-options] src_argb.raw dst_yuv.raw\n", program); printf( " -s .... specify source resolution. 
" @@ -78,7 +78,7 @@ void PrintHelp(const char* program) { exit(0); } -void ParseOptions(int argc, const char* argv[]) { +static void ParseOptions(int argc, const char* argv[]) { if (argc <= 1) { PrintHelp(argv[0]); } -- cgit v1.2.3