diff options
Diffstat (limited to 'third_party')
501 files changed, 35545 insertions, 11725 deletions
diff --git a/third_party/abseil-cpp/CMake/AbseilDll.cmake b/third_party/abseil-cpp/CMake/AbseilDll.cmake index 39f85f2ffd..8ee4120f69 100644 --- a/third_party/abseil-cpp/CMake/AbseilDll.cmake +++ b/third_party/abseil-cpp/CMake/AbseilDll.cmake @@ -1,4 +1,5 @@ include(CMakeParseArguments) +include(GNUInstallDirs) set(ABSL_INTERNAL_DLL_FILES "algorithm/algorithm.h" @@ -196,16 +197,27 @@ set(ABSL_INTERNAL_DLL_FILES "strings/cord.h" "strings/escaping.cc" "strings/escaping.h" + "strings/internal/charconv_bigint.cc" + "strings/internal/charconv_bigint.h" + "strings/internal/charconv_parse.cc" + "strings/internal/charconv_parse.h" "strings/internal/cord_internal.cc" "strings/internal/cord_internal.h" "strings/internal/cord_rep_flat.h" "strings/internal/cord_rep_ring.cc" "strings/internal/cord_rep_ring.h" "strings/internal/cord_rep_ring_reader.h" - "strings/internal/charconv_bigint.cc" - "strings/internal/charconv_bigint.h" - "strings/internal/charconv_parse.cc" - "strings/internal/charconv_parse.h" + "strings/internal/cordz_functions.cc" + "strings/internal/cordz_functions.h" + "strings/internal/cordz_handle.cc" + "strings/internal/cordz_handle.h" + "strings/internal/cordz_info.cc" + "strings/internal/cordz_info.h" + "strings/internal/cordz_sample_token.cc" + "strings/internal/cordz_sample_token.h" + "strings/internal/cordz_statistics.h" + "strings/internal/cordz_update_scope.h" + "strings/internal/cordz_update_tracker.h" "strings/internal/stl_type_traits.h" "strings/internal/string_constant.h" "strings/match.cc" @@ -500,7 +512,7 @@ function(absl_make_dll) abseil_dll PUBLIC "$<BUILD_INTERFACE:${ABSL_COMMON_INCLUDE_DIRS}>" - $<INSTALL_INTERFACE:${ABSL_INSTALL_INCLUDEDIR}> + $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}> ) target_compile_options( @@ -518,8 +530,8 @@ function(absl_make_dll) ${ABSL_CC_LIB_DEFINES} ) install(TARGETS abseil_dll EXPORT ${PROJECT_NAME}Targets - RUNTIME DESTINATION ${ABSL_INSTALL_BINDIR} - LIBRARY DESTINATION ${ABSL_INSTALL_LIBDIR} - ARCHIVE 
DESTINATION ${ABSL_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ) endfunction() diff --git a/third_party/abseil-cpp/CMake/AbseilHelpers.cmake b/third_party/abseil-cpp/CMake/AbseilHelpers.cmake index 8502c02c9a..6a64a2c788 100644 --- a/third_party/abseil-cpp/CMake/AbseilHelpers.cmake +++ b/third_party/abseil-cpp/CMake/AbseilHelpers.cmake @@ -17,7 +17,6 @@ include(CMakeParseArguments) include(AbseilConfigureCopts) include(AbseilDll) -include(AbseilInstallDirs) # The IDE folder for Abseil that will be used if Abseil is included in a CMake # project that sets @@ -142,7 +141,8 @@ function(absl_cc_library) endif() # Generate a pkg-config file for every library: - if((_build_type STREQUAL "static" OR _build_type STREQUAL "shared") AND ABSL_ENABLE_INSTALL) + if((_build_type STREQUAL "static" OR _build_type STREQUAL "shared") + AND ABSL_ENABLE_INSTALL) if(NOT ABSL_CC_LIB_TESTONLY) if(absl_VERSION) set(PC_VERSION "${absl_VERSION}") @@ -151,6 +151,10 @@ function(absl_cc_library) endif() foreach(dep ${ABSL_CC_LIB_DEPS}) if(${dep} MATCHES "^absl::(.*)") + # Join deps with commas. 
+ if(PC_DEPS) + set(PC_DEPS "${PC_DEPS},") + endif() set(PC_DEPS "${PC_DEPS} absl_${CMAKE_MATCH_1} = ${PC_VERSION}") endif() endforeach() @@ -167,18 +171,18 @@ function(absl_cc_library) FILE(GENERATE OUTPUT "${CMAKE_BINARY_DIR}/lib/pkgconfig/absl_${_NAME}.pc" CONTENT "\ prefix=${CMAKE_INSTALL_PREFIX}\n\ exec_prefix=\${prefix}\n\ -libdir=\${prefix}/lib\n\ -includedir=\${prefix}/include\n\ +libdir=${CMAKE_INSTALL_FULL_LIBDIR}\n\ +includedir=${CMAKE_INSTALL_FULL_INCLUDEDIR}\n\ \n\ Name: absl_${_NAME}\n\ Description: Abseil ${_NAME} library\n\ URL: https://abseil.io/\n\ Version: ${PC_VERSION}\n\ -Requires.private:${PC_DEPS}\n\ +Requires:${PC_DEPS}\n\ Libs: -L\${libdir} $<JOIN:${ABSL_CC_LIB_LINKOPTS}, > $<$<NOT:$<BOOL:${ABSL_CC_LIB_IS_INTERFACE}>>:-labsl_${_NAME}>\n\ Cflags: -I\${includedir}${PC_CFLAGS}\n") INSTALL(FILES "${CMAKE_BINARY_DIR}/lib/pkgconfig/absl_${_NAME}.pc" - DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/pkgconfig") + DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") endif() endif() @@ -235,7 +239,7 @@ Cflags: -I\${includedir}${PC_CFLAGS}\n") target_include_directories(${_NAME} PUBLIC "$<BUILD_INTERFACE:${ABSL_COMMON_INCLUDE_DIRS}>" - $<INSTALL_INTERFACE:${ABSL_INSTALL_INCLUDEDIR}> + $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}> ) target_compile_options(${_NAME} PRIVATE ${ABSL_CC_LIB_COPTS}) @@ -260,7 +264,6 @@ Cflags: -I\${includedir}${PC_CFLAGS}\n") if(ABSL_ENABLE_INSTALL) set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "absl_${_NAME}" - # TODO(b/173696973): Figure out how to set SOVERSION for LTS releases. SOVERSION 0 ) endif() @@ -270,7 +273,7 @@ Cflags: -I\${includedir}${PC_CFLAGS}\n") target_include_directories(${_NAME} INTERFACE "$<BUILD_INTERFACE:${ABSL_COMMON_INCLUDE_DIRS}>" - $<INSTALL_INTERFACE:${ABSL_INSTALL_INCLUDEDIR}> + $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}> ) if (_build_type STREQUAL "dll") @@ -290,9 +293,9 @@ Cflags: -I\${includedir}${PC_CFLAGS}\n") # installed abseil can't be tested. 
if(NOT ABSL_CC_LIB_TESTONLY AND ABSL_ENABLE_INSTALL) install(TARGETS ${_NAME} EXPORT ${PROJECT_NAME}Targets - RUNTIME DESTINATION ${ABSL_INSTALL_BINDIR} - LIBRARY DESTINATION ${ABSL_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${ABSL_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ) endif() @@ -333,8 +336,8 @@ endfunction() # "awesome_test.cc" # DEPS # absl::awesome -# gmock -# gtest_main +# GTest::gmock +# GTest::gtest_main # ) function(absl_cc_test) if(NOT BUILD_TESTING) diff --git a/third_party/abseil-cpp/CMake/AbseilInstallDirs.cmake b/third_party/abseil-cpp/CMake/AbseilInstallDirs.cmake deleted file mode 100644 index 6fc914b60f..0000000000 --- a/third_party/abseil-cpp/CMake/AbseilInstallDirs.cmake +++ /dev/null @@ -1,20 +0,0 @@ -include(GNUInstallDirs) - -# absl_VERSION is only set if we are an LTS release being installed, in which -# case it may be into a system directory and so we need to make subdirectories -# for each installed version of Abseil. This mechanism is implemented in -# Abseil's internal Copybara (https://github.com/google/copybara) workflows and -# isn't visible in the CMake buildsystem itself. 
- -if(absl_VERSION) - set(ABSL_SUBDIR "${PROJECT_NAME}_${PROJECT_VERSION}") - set(ABSL_INSTALL_BINDIR "${CMAKE_INSTALL_BINDIR}/${ABSL_SUBDIR}") - set(ABSL_INSTALL_CONFIGDIR "${CMAKE_INSTALL_LIBDIR}/cmake/${ABSL_SUBDIR}") - set(ABSL_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}/${ABSL_SUBDIR}") - set(ABSL_INSTALL_LIBDIR "${CMAKE_INSTALL_LIBDIR}/${ABSL_SUBDIR}") -else() - set(ABSL_INSTALL_BINDIR "${CMAKE_INSTALL_BINDIR}") - set(ABSL_INSTALL_CONFIGDIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}") - set(ABSL_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}") - set(ABSL_INSTALL_LIBDIR "${CMAKE_INSTALL_LIBDIR}") -endif() diff --git a/third_party/abseil-cpp/CMake/install_test_project/CMakeLists.txt b/third_party/abseil-cpp/CMake/install_test_project/CMakeLists.txt index 06b797e9ed..eebfe617a4 100644 --- a/third_party/abseil-cpp/CMake/install_test_project/CMakeLists.txt +++ b/third_party/abseil-cpp/CMake/install_test_project/CMakeLists.txt @@ -18,8 +18,6 @@ cmake_minimum_required(VERSION 3.5) project(absl_cmake_testing CXX) -set(CMAKE_CXX_STANDARD 11) - add_executable(simple simple.cc) find_package(absl REQUIRED) diff --git a/third_party/abseil-cpp/CMakeLists.txt b/third_party/abseil-cpp/CMakeLists.txt index e68810e3cf..42bcbe100b 100644 --- a/third_party/abseil-cpp/CMakeLists.txt +++ b/third_party/abseil-cpp/CMakeLists.txt @@ -41,11 +41,16 @@ if (POLICY CMP0077) cmake_policy(SET CMP0077 NEW) endif (POLICY CMP0077) +# Allow the user to specify the MSVC runtime +if (POLICY CMP0091) + cmake_policy(SET CMP0091 NEW) +endif (POLICY CMP0091) + # Set BUILD_TESTING to OFF by default. # This must come before the project() and include(CTest) lines. OPTION(BUILD_TESTING "Build tests" OFF) -project(absl CXX) +project(absl LANGUAGES CXX) include(CTest) # Output directory is correct by default for most build setups. 
However, when @@ -67,8 +72,8 @@ list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/absl/copts ) -include(AbseilInstallDirs) include(CMakePackageConfigHelpers) +include(GNUInstallDirs) include(AbseilDll) include(AbseilHelpers) @@ -97,9 +102,18 @@ endif() ## pthread find_package(Threads REQUIRED) +include(CMakeDependentOption) + option(ABSL_USE_EXTERNAL_GOOGLETEST "If ON, Abseil will assume that the targets for GoogleTest are already provided by the including project. This makes sense when Abseil is used with add_subproject." OFF) +cmake_dependent_option(ABSL_FIND_GOOGLETEST + "If ON, Abseil will use find_package(GTest) rather than assuming that GoogleTest is already provided by the including project." + ON + "ABSL_USE_EXTERNAL_GOOGLETEST" + OFF) + + option(ABSL_USE_GOOGLETEST_HEAD "If ON, abseil will download HEAD from GoogleTest at config time." OFF) @@ -111,7 +125,15 @@ set(ABSL_LOCAL_GOOGLETEST_DIR "/usr/src/googletest" CACHE PATH if(BUILD_TESTING) ## check targets - if (NOT ABSL_USE_EXTERNAL_GOOGLETEST) + if (ABSL_USE_EXTERNAL_GOOGLETEST) + if (ABSL_FIND_GOOGLETEST) + find_package(GTest REQUIRED) + else() + if (NOT TARGET gtest AND NOT TARGET GTest::gtest) + message(FATAL_ERROR "ABSL_USE_EXTERNAL_GOOGLETEST is ON and ABSL_FIND_GOOGLETEST is OFF, which means that the top-level project must build the Google Test project. However, the target gtest was not found.") + endif() + endif() + else() set(absl_gtest_build_dir ${CMAKE_BINARY_DIR}/googletest-build) if(ABSL_USE_GOOGLETEST_HEAD AND ABSL_GOOGLETEST_DOWNLOAD_URL) message(FATAL_ERROR "Do not set both ABSL_USE_GOOGLETEST_HEAD and ABSL_GOOGLETEST_DOWNLOAD_URL") @@ -129,14 +151,22 @@ if(BUILD_TESTING) include(CMake/Googletest/DownloadGTest.cmake) endif() - check_target(gtest) - check_target(gtest_main) - check_target(gmock) + if (NOT ABSL_FIND_GOOGLETEST) + # When Google Test is included directly rather than through find_package, the aliases are missing. 
+ add_library(GTest::gtest_main ALIAS gtest_main) + add_library(GTest::gtest ALIAS gtest) + add_library(GTest::gmock ALIAS gmock) + endif() + + check_target(GTest::gtest) + check_target(GTest::gtest_main) + check_target(GTest::gmock) + check_target(GTest::gmock_main) list(APPEND ABSL_TEST_COMMON_LIBRARIES - gtest_main - gtest - gmock + GTest::gtest_main + GTest::gtest + GTest::gmock ${CMAKE_THREAD_LIBS_INIT} ) endif() @@ -144,7 +174,6 @@ endif() add_subdirectory(absl) if(ABSL_ENABLE_INSTALL) - message(FATAL_ERROR "Please do not install abseil") # absl:lts-remove-begin(system installation is supported for LTS releases) # We don't support system-wide installation list(APPEND SYSTEM_INSTALL_DIRS "/usr/local" "/usr" "/opt/" "/opt/local" "c:/Program Files/${PROJECT_NAME}") @@ -160,16 +189,16 @@ if(ABSL_ENABLE_INSTALL) # install as a subdirectory only install(EXPORT ${PROJECT_NAME}Targets NAMESPACE absl:: - DESTINATION "${ABSL_INSTALL_CONFIGDIR}" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" ) configure_package_config_file( CMake/abslConfig.cmake.in "${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" - INSTALL_DESTINATION "${ABSL_INSTALL_CONFIGDIR}" + INSTALL_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" ) install(FILES "${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" - DESTINATION "${ABSL_INSTALL_CONFIGDIR}" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" ) # Abseil only has a version in LTS releases. 
This mechanism is accomplished @@ -182,12 +211,12 @@ if(ABSL_ENABLE_INSTALL) ) install(FILES "${PROJECT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" - DESTINATION ${ABSL_INSTALL_CONFIGDIR} + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" ) endif() # absl_VERSION install(DIRECTORY absl - DESTINATION ${ABSL_INSTALL_INCLUDEDIR} + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} FILES_MATCHING PATTERN "*.inc" PATTERN "*.h" diff --git a/third_party/abseil-cpp/absl/CMakeLists.txt b/third_party/abseil-cpp/absl/CMakeLists.txt index fbfa7822b5..a41e1eeb35 100644 --- a/third_party/abseil-cpp/absl/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/CMakeLists.txt @@ -16,6 +16,7 @@ add_subdirectory(base) add_subdirectory(algorithm) +add_subdirectory(cleanup) add_subdirectory(container) add_subdirectory(debugging) add_subdirectory(flags) diff --git a/third_party/abseil-cpp/absl/algorithm/CMakeLists.txt b/third_party/abseil-cpp/absl/algorithm/CMakeLists.txt index 56cd0fb85b..609d858946 100644 --- a/third_party/abseil-cpp/absl/algorithm/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/algorithm/CMakeLists.txt @@ -35,7 +35,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::algorithm - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -65,5 +65,5 @@ absl_cc_test( absl::core_headers absl::memory absl::span - gmock_main + GTest::gmock_main ) diff --git a/third_party/abseil-cpp/absl/algorithm/container.h b/third_party/abseil-cpp/absl/algorithm/container.h index 6398438f08..1652e7b055 100644 --- a/third_party/abseil-cpp/absl/algorithm/container.h +++ b/third_party/abseil-cpp/absl/algorithm/container.h @@ -905,11 +905,11 @@ void c_sort(C& c) { // Overload of c_sort() for performing a `comp` comparison other than the // default `operator<`. 
-template <typename C, typename Compare> -void c_sort(C& c, Compare&& comp) { +template <typename C, typename LessThan> +void c_sort(C& c, LessThan&& comp) { std::sort(container_algorithm_internal::c_begin(c), container_algorithm_internal::c_end(c), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_stable_sort() @@ -925,11 +925,11 @@ void c_stable_sort(C& c) { // Overload of c_stable_sort() for performing a `comp` comparison other than the // default `operator<`. -template <typename C, typename Compare> -void c_stable_sort(C& c, Compare&& comp) { +template <typename C, typename LessThan> +void c_stable_sort(C& c, LessThan&& comp) { std::stable_sort(container_algorithm_internal::c_begin(c), container_algorithm_internal::c_end(c), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_is_sorted() @@ -944,11 +944,11 @@ bool c_is_sorted(const C& c) { // c_is_sorted() overload for performing a `comp` comparison other than the // default `operator<`. -template <typename C, typename Compare> -bool c_is_sorted(const C& c, Compare&& comp) { +template <typename C, typename LessThan> +bool c_is_sorted(const C& c, LessThan&& comp) { return std::is_sorted(container_algorithm_internal::c_begin(c), container_algorithm_internal::c_end(c), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_partial_sort() @@ -966,14 +966,14 @@ void c_partial_sort( // Overload of c_partial_sort() for performing a `comp` comparison other than // the default `operator<`. 
-template <typename RandomAccessContainer, typename Compare> +template <typename RandomAccessContainer, typename LessThan> void c_partial_sort( RandomAccessContainer& sequence, container_algorithm_internal::ContainerIter<RandomAccessContainer> middle, - Compare&& comp) { + LessThan&& comp) { std::partial_sort(container_algorithm_internal::c_begin(sequence), middle, container_algorithm_internal::c_end(sequence), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_partial_sort_copy() @@ -994,15 +994,15 @@ c_partial_sort_copy(const C& sequence, RandomAccessContainer& result) { // Overload of c_partial_sort_copy() for performing a `comp` comparison other // than the default `operator<`. -template <typename C, typename RandomAccessContainer, typename Compare> +template <typename C, typename RandomAccessContainer, typename LessThan> container_algorithm_internal::ContainerIter<RandomAccessContainer> c_partial_sort_copy(const C& sequence, RandomAccessContainer& result, - Compare&& comp) { + LessThan&& comp) { return std::partial_sort_copy(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), container_algorithm_internal::c_begin(result), container_algorithm_internal::c_end(result), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_is_sorted_until() @@ -1018,12 +1018,12 @@ container_algorithm_internal::ContainerIter<C> c_is_sorted_until(C& c) { // Overload of c_is_sorted_until() for performing a `comp` comparison other than // the default `operator<`. 
-template <typename C, typename Compare> +template <typename C, typename LessThan> container_algorithm_internal::ContainerIter<C> c_is_sorted_until( - C& c, Compare&& comp) { + C& c, LessThan&& comp) { return std::is_sorted_until(container_algorithm_internal::c_begin(c), container_algorithm_internal::c_end(c), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_nth_element() @@ -1043,14 +1043,14 @@ void c_nth_element( // Overload of c_nth_element() for performing a `comp` comparison other than // the default `operator<`. -template <typename RandomAccessContainer, typename Compare> +template <typename RandomAccessContainer, typename LessThan> void c_nth_element( RandomAccessContainer& sequence, container_algorithm_internal::ContainerIter<RandomAccessContainer> nth, - Compare&& comp) { + LessThan&& comp) { std::nth_element(container_algorithm_internal::c_begin(sequence), nth, container_algorithm_internal::c_end(sequence), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } //------------------------------------------------------------------------------ @@ -1072,12 +1072,12 @@ container_algorithm_internal::ContainerIter<Sequence> c_lower_bound( // Overload of c_lower_bound() for performing a `comp` comparison other than // the default `operator<`. 
-template <typename Sequence, typename T, typename Compare> +template <typename Sequence, typename T, typename LessThan> container_algorithm_internal::ContainerIter<Sequence> c_lower_bound( - Sequence& sequence, T&& value, Compare&& comp) { + Sequence& sequence, T&& value, LessThan&& comp) { return std::lower_bound(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), - std::forward<T>(value), std::forward<Compare>(comp)); + std::forward<T>(value), std::forward<LessThan>(comp)); } // c_upper_bound() @@ -1095,12 +1095,12 @@ container_algorithm_internal::ContainerIter<Sequence> c_upper_bound( // Overload of c_upper_bound() for performing a `comp` comparison other than // the default `operator<`. -template <typename Sequence, typename T, typename Compare> +template <typename Sequence, typename T, typename LessThan> container_algorithm_internal::ContainerIter<Sequence> c_upper_bound( - Sequence& sequence, T&& value, Compare&& comp) { + Sequence& sequence, T&& value, LessThan&& comp) { return std::upper_bound(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), - std::forward<T>(value), std::forward<Compare>(comp)); + std::forward<T>(value), std::forward<LessThan>(comp)); } // c_equal_range() @@ -1118,12 +1118,12 @@ c_equal_range(Sequence& sequence, T&& value) { // Overload of c_equal_range() for performing a `comp` comparison other than // the default `operator<`. 
-template <typename Sequence, typename T, typename Compare> +template <typename Sequence, typename T, typename LessThan> container_algorithm_internal::ContainerIterPairType<Sequence, Sequence> -c_equal_range(Sequence& sequence, T&& value, Compare&& comp) { +c_equal_range(Sequence& sequence, T&& value, LessThan&& comp) { return std::equal_range(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), - std::forward<T>(value), std::forward<Compare>(comp)); + std::forward<T>(value), std::forward<LessThan>(comp)); } // c_binary_search() @@ -1140,12 +1140,12 @@ bool c_binary_search(Sequence&& sequence, T&& value) { // Overload of c_binary_search() for performing a `comp` comparison other than // the default `operator<`. -template <typename Sequence, typename T, typename Compare> -bool c_binary_search(Sequence&& sequence, T&& value, Compare&& comp) { +template <typename Sequence, typename T, typename LessThan> +bool c_binary_search(Sequence&& sequence, T&& value, LessThan&& comp) { return std::binary_search(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), std::forward<T>(value), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } //------------------------------------------------------------------------------ @@ -1166,14 +1166,14 @@ OutputIterator c_merge(const C1& c1, const C2& c2, OutputIterator result) { // Overload of c_merge() for performing a `comp` comparison other than // the default `operator<`. 
-template <typename C1, typename C2, typename OutputIterator, typename Compare> +template <typename C1, typename C2, typename OutputIterator, typename LessThan> OutputIterator c_merge(const C1& c1, const C2& c2, OutputIterator result, - Compare&& comp) { + LessThan&& comp) { return std::merge(container_algorithm_internal::c_begin(c1), container_algorithm_internal::c_end(c1), container_algorithm_internal::c_begin(c2), container_algorithm_internal::c_end(c2), result, - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_inplace_merge() @@ -1189,13 +1189,13 @@ void c_inplace_merge(C& c, // Overload of c_inplace_merge() for performing a merge using a `comp` other // than `operator<`. -template <typename C, typename Compare> +template <typename C, typename LessThan> void c_inplace_merge(C& c, container_algorithm_internal::ContainerIter<C> middle, - Compare&& comp) { + LessThan&& comp) { std::inplace_merge(container_algorithm_internal::c_begin(c), middle, container_algorithm_internal::c_end(c), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_includes() @@ -1213,13 +1213,13 @@ bool c_includes(const C1& c1, const C2& c2) { // Overload of c_includes() for performing a merge using a `comp` other than // `operator<`. -template <typename C1, typename C2, typename Compare> -bool c_includes(const C1& c1, const C2& c2, Compare&& comp) { +template <typename C1, typename C2, typename LessThan> +bool c_includes(const C1& c1, const C2& c2, LessThan&& comp) { return std::includes(container_algorithm_internal::c_begin(c1), container_algorithm_internal::c_end(c1), container_algorithm_internal::c_begin(c2), container_algorithm_internal::c_end(c2), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_set_union() @@ -1243,7 +1243,7 @@ OutputIterator c_set_union(const C1& c1, const C2& c2, OutputIterator output) { // Overload of c_set_union() for performing a merge using a `comp` other than // `operator<`. 
-template <typename C1, typename C2, typename OutputIterator, typename Compare, +template <typename C1, typename C2, typename OutputIterator, typename LessThan, typename = typename std::enable_if< !container_algorithm_internal::IsUnorderedContainer<C1>::value, void>::type, @@ -1251,12 +1251,12 @@ template <typename C1, typename C2, typename OutputIterator, typename Compare, !container_algorithm_internal::IsUnorderedContainer<C2>::value, void>::type> OutputIterator c_set_union(const C1& c1, const C2& c2, OutputIterator output, - Compare&& comp) { + LessThan&& comp) { return std::set_union(container_algorithm_internal::c_begin(c1), container_algorithm_internal::c_end(c1), container_algorithm_internal::c_begin(c2), container_algorithm_internal::c_end(c2), output, - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_set_intersection() @@ -1280,7 +1280,7 @@ OutputIterator c_set_intersection(const C1& c1, const C2& c2, // Overload of c_set_intersection() for performing a merge using a `comp` other // than `operator<`. 
-template <typename C1, typename C2, typename OutputIterator, typename Compare, +template <typename C1, typename C2, typename OutputIterator, typename LessThan, typename = typename std::enable_if< !container_algorithm_internal::IsUnorderedContainer<C1>::value, void>::type, @@ -1288,12 +1288,12 @@ template <typename C1, typename C2, typename OutputIterator, typename Compare, !container_algorithm_internal::IsUnorderedContainer<C2>::value, void>::type> OutputIterator c_set_intersection(const C1& c1, const C2& c2, - OutputIterator output, Compare&& comp) { + OutputIterator output, LessThan&& comp) { return std::set_intersection(container_algorithm_internal::c_begin(c1), container_algorithm_internal::c_end(c1), container_algorithm_internal::c_begin(c2), container_algorithm_internal::c_end(c2), output, - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_set_difference() @@ -1318,7 +1318,7 @@ OutputIterator c_set_difference(const C1& c1, const C2& c2, // Overload of c_set_difference() for performing a merge using a `comp` other // than `operator<`. 
-template <typename C1, typename C2, typename OutputIterator, typename Compare, +template <typename C1, typename C2, typename OutputIterator, typename LessThan, typename = typename std::enable_if< !container_algorithm_internal::IsUnorderedContainer<C1>::value, void>::type, @@ -1326,12 +1326,12 @@ template <typename C1, typename C2, typename OutputIterator, typename Compare, !container_algorithm_internal::IsUnorderedContainer<C2>::value, void>::type> OutputIterator c_set_difference(const C1& c1, const C2& c2, - OutputIterator output, Compare&& comp) { + OutputIterator output, LessThan&& comp) { return std::set_difference(container_algorithm_internal::c_begin(c1), container_algorithm_internal::c_end(c1), container_algorithm_internal::c_begin(c2), container_algorithm_internal::c_end(c2), output, - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_set_symmetric_difference() @@ -1357,7 +1357,7 @@ OutputIterator c_set_symmetric_difference(const C1& c1, const C2& c2, // Overload of c_set_symmetric_difference() for performing a merge using a // `comp` other than `operator<`. 
-template <typename C1, typename C2, typename OutputIterator, typename Compare, +template <typename C1, typename C2, typename OutputIterator, typename LessThan, typename = typename std::enable_if< !container_algorithm_internal::IsUnorderedContainer<C1>::value, void>::type, @@ -1366,13 +1366,13 @@ template <typename C1, typename C2, typename OutputIterator, typename Compare, void>::type> OutputIterator c_set_symmetric_difference(const C1& c1, const C2& c2, OutputIterator output, - Compare&& comp) { + LessThan&& comp) { return std::set_symmetric_difference( container_algorithm_internal::c_begin(c1), container_algorithm_internal::c_end(c1), container_algorithm_internal::c_begin(c2), container_algorithm_internal::c_end(c2), output, - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } //------------------------------------------------------------------------------ @@ -1391,11 +1391,11 @@ void c_push_heap(RandomAccessContainer& sequence) { // Overload of c_push_heap() for performing a push operation on a heap using a // `comp` other than `operator<`. -template <typename RandomAccessContainer, typename Compare> -void c_push_heap(RandomAccessContainer& sequence, Compare&& comp) { +template <typename RandomAccessContainer, typename LessThan> +void c_push_heap(RandomAccessContainer& sequence, LessThan&& comp) { std::push_heap(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_pop_heap() @@ -1410,11 +1410,11 @@ void c_pop_heap(RandomAccessContainer& sequence) { // Overload of c_pop_heap() for performing a pop operation on a heap using a // `comp` other than `operator<`. 
-template <typename RandomAccessContainer, typename Compare> -void c_pop_heap(RandomAccessContainer& sequence, Compare&& comp) { +template <typename RandomAccessContainer, typename LessThan> +void c_pop_heap(RandomAccessContainer& sequence, LessThan&& comp) { std::pop_heap(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_make_heap() @@ -1429,11 +1429,11 @@ void c_make_heap(RandomAccessContainer& sequence) { // Overload of c_make_heap() for performing heap comparisons using a // `comp` other than `operator<` -template <typename RandomAccessContainer, typename Compare> -void c_make_heap(RandomAccessContainer& sequence, Compare&& comp) { +template <typename RandomAccessContainer, typename LessThan> +void c_make_heap(RandomAccessContainer& sequence, LessThan&& comp) { std::make_heap(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_sort_heap() @@ -1448,11 +1448,11 @@ void c_sort_heap(RandomAccessContainer& sequence) { // Overload of c_sort_heap() for performing heap comparisons using a // `comp` other than `operator<` -template <typename RandomAccessContainer, typename Compare> -void c_sort_heap(RandomAccessContainer& sequence, Compare&& comp) { +template <typename RandomAccessContainer, typename LessThan> +void c_sort_heap(RandomAccessContainer& sequence, LessThan&& comp) { std::sort_heap(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_is_heap() @@ -1467,11 +1467,11 @@ bool c_is_heap(const RandomAccessContainer& sequence) { // Overload of c_is_heap() for performing heap comparisons using a // `comp` other than `operator<` -template <typename RandomAccessContainer, typename Compare> -bool c_is_heap(const 
RandomAccessContainer& sequence, Compare&& comp) { +template <typename RandomAccessContainer, typename LessThan> +bool c_is_heap(const RandomAccessContainer& sequence, LessThan&& comp) { return std::is_heap(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_is_heap_until() @@ -1487,12 +1487,12 @@ c_is_heap_until(RandomAccessContainer& sequence) { // Overload of c_is_heap_until() for performing heap comparisons using a // `comp` other than `operator<` -template <typename RandomAccessContainer, typename Compare> +template <typename RandomAccessContainer, typename LessThan> container_algorithm_internal::ContainerIter<RandomAccessContainer> -c_is_heap_until(RandomAccessContainer& sequence, Compare&& comp) { +c_is_heap_until(RandomAccessContainer& sequence, LessThan&& comp) { return std::is_heap_until(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } //------------------------------------------------------------------------------ @@ -1513,12 +1513,12 @@ container_algorithm_internal::ContainerIter<Sequence> c_min_element( // Overload of c_min_element() for performing a `comp` comparison other than // `operator<`. -template <typename Sequence, typename Compare> +template <typename Sequence, typename LessThan> container_algorithm_internal::ContainerIter<Sequence> c_min_element( - Sequence& sequence, Compare&& comp) { + Sequence& sequence, LessThan&& comp) { return std::min_element(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_max_element() @@ -1535,12 +1535,12 @@ container_algorithm_internal::ContainerIter<Sequence> c_max_element( // Overload of c_max_element() for performing a `comp` comparison other than // `operator<`. 
-template <typename Sequence, typename Compare> +template <typename Sequence, typename LessThan> container_algorithm_internal::ContainerIter<Sequence> c_max_element( - Sequence& sequence, Compare&& comp) { + Sequence& sequence, LessThan&& comp) { return std::max_element(container_algorithm_internal::c_begin(sequence), container_algorithm_internal::c_end(sequence), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_minmax_element() @@ -1558,12 +1558,12 @@ c_minmax_element(C& c) { // Overload of c_minmax_element() for performing `comp` comparisons other than // `operator<`. -template <typename C, typename Compare> +template <typename C, typename LessThan> container_algorithm_internal::ContainerIterPairType<C, C> -c_minmax_element(C& c, Compare&& comp) { +c_minmax_element(C& c, LessThan&& comp) { return std::minmax_element(container_algorithm_internal::c_begin(c), container_algorithm_internal::c_end(c), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } //------------------------------------------------------------------------------ @@ -1588,15 +1588,15 @@ bool c_lexicographical_compare(Sequence1&& sequence1, Sequence2&& sequence2) { // Overload of c_lexicographical_compare() for performing a lexicographical // comparison using a `comp` operator instead of `operator<`. 
-template <typename Sequence1, typename Sequence2, typename Compare> +template <typename Sequence1, typename Sequence2, typename LessThan> bool c_lexicographical_compare(Sequence1&& sequence1, Sequence2&& sequence2, - Compare&& comp) { + LessThan&& comp) { return std::lexicographical_compare( container_algorithm_internal::c_begin(sequence1), container_algorithm_internal::c_end(sequence1), container_algorithm_internal::c_begin(sequence2), container_algorithm_internal::c_end(sequence2), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_next_permutation() @@ -1612,11 +1612,11 @@ bool c_next_permutation(C& c) { // Overload of c_next_permutation() for performing a lexicographical // comparison using a `comp` operator instead of `operator<`. -template <typename C, typename Compare> -bool c_next_permutation(C& c, Compare&& comp) { +template <typename C, typename LessThan> +bool c_next_permutation(C& c, LessThan&& comp) { return std::next_permutation(container_algorithm_internal::c_begin(c), container_algorithm_internal::c_end(c), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } // c_prev_permutation() @@ -1632,11 +1632,11 @@ bool c_prev_permutation(C& c) { // Overload of c_prev_permutation() for performing a lexicographical // comparison using a `comp` operator instead of `operator<`. 
-template <typename C, typename Compare> -bool c_prev_permutation(C& c, Compare&& comp) { +template <typename C, typename LessThan> +bool c_prev_permutation(C& c, LessThan&& comp) { return std::prev_permutation(container_algorithm_internal::c_begin(c), container_algorithm_internal::c_end(c), - std::forward<Compare>(comp)); + std::forward<LessThan>(comp)); } //------------------------------------------------------------------------------ diff --git a/third_party/abseil-cpp/absl/base/CMakeLists.txt b/third_party/abseil-cpp/absl/base/CMakeLists.txt index 981b8cc008..7d56aa1346 100644 --- a/third_party/abseil-cpp/absl/base/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/base/CMakeLists.txt @@ -230,7 +230,7 @@ absl_cc_library( ${ABSL_DEFAULT_COPTS} DEPS absl::config - gtest + GTest::gtest TESTONLY ) @@ -259,7 +259,7 @@ absl_cc_library( absl::meta absl::strings absl::utility - gtest + GTest::gtest TESTONLY ) @@ -273,7 +273,7 @@ absl_cc_test( DEPS absl::exception_safety_testing absl::memory - gtest_main + GTest::gtest_main ) absl_cc_library( @@ -300,8 +300,8 @@ absl_cc_test( absl::atomic_hook_test_helper absl::atomic_hook absl::core_headers - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -314,7 +314,7 @@ absl_cc_test( DEPS absl::base absl::core_headers - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -327,8 +327,8 @@ absl_cc_test( DEPS absl::errno_saver absl::strerror - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -342,7 +342,7 @@ absl_cc_test( absl::base absl::config absl::throw_delegate - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -357,7 +357,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::base_internal - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -371,8 +371,8 @@ absl_cc_test( absl::base_internal absl::memory absl::strings - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_library( @@ -388,7 +388,7 @@ absl_cc_library( absl::base_internal absl::core_headers absl::synchronization - 
gtest + GTest::gtest TESTONLY ) @@ -406,7 +406,7 @@ absl_cc_test( absl::config absl::core_headers absl::synchronization - gtest_main + GTest::gtest_main ) absl_cc_library( @@ -435,7 +435,7 @@ absl_cc_test( absl::base absl::config absl::endian - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -448,7 +448,7 @@ absl_cc_test( DEPS absl::config absl::synchronization - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -462,7 +462,7 @@ absl_cc_test( absl::base absl::core_headers absl::synchronization - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -475,7 +475,7 @@ absl_cc_test( DEPS absl::raw_logging_internal absl::strings - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -488,7 +488,7 @@ absl_cc_test( DEPS absl::base absl::synchronization - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -516,7 +516,7 @@ absl_cc_test( absl::core_headers absl::synchronization Threads::Threads - gtest_main + GTest::gtest_main ) absl_cc_library( @@ -543,7 +543,7 @@ absl_cc_test( DEPS absl::exponential_biased absl::strings - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -570,7 +570,7 @@ absl_cc_test( DEPS absl::core_headers absl::periodic_sampler - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -596,7 +596,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::scoped_set_env - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -620,8 +620,8 @@ absl_cc_test( absl::flags_marshalling absl::log_severity absl::strings - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_library( @@ -651,8 +651,8 @@ absl_cc_test( DEPS absl::strerror absl::strings - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_library( @@ -677,7 +677,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::fast_type_id - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -690,5 +690,5 @@ absl_cc_test( DEPS absl::core_headers absl::optional - gtest_main + GTest::gtest_main ) diff --git a/third_party/abseil-cpp/absl/base/attributes.h b/third_party/abseil-cpp/absl/base/attributes.h index 
cf2cb5501e..52139556f2 100644 --- a/third_party/abseil-cpp/absl/base/attributes.h +++ b/third_party/abseil-cpp/absl/base/attributes.h @@ -131,14 +131,14 @@ // ABSL_ATTRIBUTE_WEAK // // Tags a function as weak for the purposes of compilation and linking. -// Weak attributes currently do not work properly in LLVM's Windows backend, -// so disable them there. See https://bugs.llvm.org/show_bug.cgi?id=37598 +// Weak attributes did not work properly in LLVM's Windows backend before +// 9.0.0, so disable them there. See https://bugs.llvm.org/show_bug.cgi?id=37598 // for further information. // The MinGW compiler doesn't complain about the weak attribute until the link // step, presumably because Windows doesn't use ELF binaries. #if (ABSL_HAVE_ATTRIBUTE(weak) || \ (defined(__GNUC__) && !defined(__clang__))) && \ - !(defined(__llvm__) && defined(_WIN32)) && !defined(__MINGW32__) + (!defined(_WIN32) || __clang_major__ < 9) && !defined(__MINGW32__) #undef ABSL_ATTRIBUTE_WEAK #define ABSL_ATTRIBUTE_WEAK __attribute__((weak)) #define ABSL_HAVE_ATTRIBUTE_WEAK 1 @@ -281,10 +281,7 @@ // ABSL_ATTRIBUTE_RETURNS_NONNULL // // Tells the compiler that a particular function never returns a null pointer. -#if ABSL_HAVE_ATTRIBUTE(returns_nonnull) || \ - (defined(__GNUC__) && \ - (__GNUC__ > 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) && \ - !defined(__clang__)) +#if ABSL_HAVE_ATTRIBUTE(returns_nonnull) #define ABSL_ATTRIBUTE_RETURNS_NONNULL __attribute__((returns_nonnull)) #else #define ABSL_ATTRIBUTE_RETURNS_NONNULL @@ -524,6 +521,13 @@ // ABSL_ATTRIBUTE_UNUSED // // Prevents the compiler from complaining about variables that appear unused. +// +// For code or headers that are assured to only build with C++17 and up, prefer +// just using the standard '[[maybe_unused]]' directly over this macro. 
+// +// Due to differences in positioning requirements between the old, compiler +// specific __attribute__ syntax and the now standard [[maybe_unused]], this +// macro does not attempt to take advantage of '[[maybe_unused]]'. #if ABSL_HAVE_ATTRIBUTE(unused) || (defined(__GNUC__) && !defined(__clang__)) #undef ABSL_ATTRIBUTE_UNUSED #define ABSL_ATTRIBUTE_UNUSED __attribute__((__unused__)) @@ -595,31 +599,24 @@ // case 42: // ... // -// Notes: when compiled with clang in C++11 mode, the ABSL_FALLTHROUGH_INTENDED -// macro is expanded to the [[clang::fallthrough]] attribute, which is analysed -// when performing switch labels fall-through diagnostic -// (`-Wimplicit-fallthrough`). See clang documentation on language extensions -// for details: +// Notes: When supported, GCC and Clang can issue a warning on switch labels +// with unannotated fallthrough using the warning `-Wimplicit-fallthrough`. See +// clang documentation on language extensions for details: // https://clang.llvm.org/docs/AttributeReference.html#fallthrough-clang-fallthrough // -// When used with unsupported compilers, the ABSL_FALLTHROUGH_INTENDED macro -// has no effect on diagnostics. In any case this macro has no effect on runtime +// When used with unsupported compilers, the ABSL_FALLTHROUGH_INTENDED macro has +// no effect on diagnostics. In any case this macro has no effect on runtime // behavior and performance of code. #ifdef ABSL_FALLTHROUGH_INTENDED #error "ABSL_FALLTHROUGH_INTENDED should not be defined." -#endif - -// TODO(zhangxy): Use c++17 standard [[fallthrough]] macro, when supported. 
-#if defined(__clang__) && defined(__has_warning) -#if __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough") +#elif ABSL_HAVE_CPP_ATTRIBUTE(fallthrough) +#define ABSL_FALLTHROUGH_INTENDED [[fallthrough]] +#elif ABSL_HAVE_CPP_ATTRIBUTE(clang::fallthrough) #define ABSL_FALLTHROUGH_INTENDED [[clang::fallthrough]] -#endif -#elif defined(__GNUC__) && __GNUC__ >= 7 +#elif ABSL_HAVE_CPP_ATTRIBUTE(gnu::fallthrough) #define ABSL_FALLTHROUGH_INTENDED [[gnu::fallthrough]] -#endif - -#ifndef ABSL_FALLTHROUGH_INTENDED +#else #define ABSL_FALLTHROUGH_INTENDED \ do { \ } while (0) @@ -699,4 +696,26 @@ #define ABSL_ATTRIBUTE_PURE_FUNCTION #endif +// ABSL_ATTRIBUTE_LIFETIME_BOUND indicates that a resource owned by a function +// parameter or implicit object parameter is retained by the return value of the +// annotated function (or, for a parameter of a constructor, in the value of the +// constructed object). This attribute causes warnings to be produced if a +// temporary object does not live long enough. +// +// When applied to a reference parameter, the referenced object is assumed to be +// retained by the return value of the function. When applied to a non-reference +// parameter (for example, a pointer or a class type), all temporaries +// referenced by the parameter are assumed to be retained by the return value of +// the function. 
+// +// See also the upstream documentation: +// https://clang.llvm.org/docs/AttributeReference.html#lifetimebound +#if ABSL_HAVE_CPP_ATTRIBUTE(clang::lifetimebound) +#define ABSL_ATTRIBUTE_LIFETIME_BOUND [[clang::lifetimebound]] +#elif ABSL_HAVE_ATTRIBUTE(lifetimebound) +#define ABSL_ATTRIBUTE_LIFETIME_BOUND __attribute__((lifetimebound)) +#else +#define ABSL_ATTRIBUTE_LIFETIME_BOUND +#endif + #endif // ABSL_BASE_ATTRIBUTES_H_ diff --git a/third_party/abseil-cpp/absl/base/config.h b/third_party/abseil-cpp/absl/base/config.h index 95449969e7..0524196d56 100644 --- a/third_party/abseil-cpp/absl/base/config.h +++ b/third_party/abseil-cpp/absl/base/config.h @@ -166,6 +166,22 @@ static_assert(ABSL_INTERNAL_INLINE_NAMESPACE_STR[0] != 'h' || #define ABSL_HAVE_FEATURE(f) 0 #endif +// Portable check for GCC minimum version: +// https://gcc.gnu.org/onlinedocs/cpp/Common-Predefined-Macros.html +#if defined(__GNUC__) && defined(__GNUC_MINOR__) +#define ABSL_INTERNAL_HAVE_MIN_GNUC_VERSION(x, y) \ + (__GNUC__ > (x) || __GNUC__ == (x) && __GNUC_MINOR__ >= (y)) +#else +#define ABSL_INTERNAL_HAVE_MIN_GNUC_VERSION(x, y) 0 +#endif + +#if defined(__clang__) && defined(__clang_major__) && defined(__clang_minor__) +#define ABSL_INTERNAL_HAVE_MIN_CLANG_VERSION(x, y) \ + (__clang_major__ > (x) || __clang_major__ == (x) && __clang_minor__ >= (y)) +#else +#define ABSL_INTERNAL_HAVE_MIN_CLANG_VERSION(x, y) 0 +#endif + // ABSL_HAVE_TLS is defined to 1 when __thread should be supported. // We assume __thread is supported on Linux when compiled with Clang or compiled // against libstdc++ with _GLIBCXX_HAVE_TLS defined. @@ -183,10 +199,9 @@ static_assert(ABSL_INTERNAL_INLINE_NAMESPACE_STR[0] != 'h' || // gcc >= 4.8.1 using libstdc++, and Visual Studio. 
#ifdef ABSL_HAVE_STD_IS_TRIVIALLY_DESTRUCTIBLE #error ABSL_HAVE_STD_IS_TRIVIALLY_DESTRUCTIBLE cannot be directly set -#elif defined(_LIBCPP_VERSION) || \ - (!defined(__clang__) && defined(__GNUC__) && defined(__GLIBCXX__) && \ - (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))) || \ - defined(_MSC_VER) +#elif defined(_LIBCPP_VERSION) || defined(_MSC_VER) || \ + (!defined(__clang__) && defined(__GLIBCXX__) && \ + ABSL_INTERNAL_HAVE_MIN_GNUC_VERSION(4, 8)) #define ABSL_HAVE_STD_IS_TRIVIALLY_DESTRUCTIBLE 1 #endif @@ -205,10 +220,9 @@ static_assert(ABSL_INTERNAL_INLINE_NAMESPACE_STR[0] != 'h' || #error ABSL_HAVE_STD_IS_TRIVIALLY_CONSTRUCTIBLE cannot be directly set #elif defined(ABSL_HAVE_STD_IS_TRIVIALLY_ASSIGNABLE) #error ABSL_HAVE_STD_IS_TRIVIALLY_ASSIGNABLE cannot directly set -#elif (defined(__clang__) && defined(_LIBCPP_VERSION)) || \ - (!defined(__clang__) && defined(__GNUC__) && \ - (__GNUC__ > 7 || (__GNUC__ == 7 && __GNUC_MINOR__ >= 4)) && \ - (defined(_LIBCPP_VERSION) || defined(__GLIBCXX__))) || \ +#elif (defined(__clang__) && defined(_LIBCPP_VERSION)) || \ + (!defined(__clang__) && ABSL_INTERNAL_HAVE_MIN_GNUC_VERSION(7, 4) && \ + (defined(_LIBCPP_VERSION) || defined(__GLIBCXX__))) || \ (defined(_MSC_VER) && !defined(__NVCC__)) #define ABSL_HAVE_STD_IS_TRIVIALLY_CONSTRUCTIBLE 1 #define ABSL_HAVE_STD_IS_TRIVIALLY_ASSIGNABLE 1 @@ -222,7 +236,7 @@ static_assert(ABSL_INTERNAL_INLINE_NAMESPACE_STR[0] != 'h' || #if ABSL_INTERNAL_HAS_KEYWORD(__builtin_LINE) && \ ABSL_INTERNAL_HAS_KEYWORD(__builtin_FILE) #define ABSL_HAVE_SOURCE_LOCATION_CURRENT 1 -#elif defined(__GNUC__) && __GNUC__ >= 5 +#elif ABSL_INTERNAL_HAVE_MIN_GNUC_VERSION(5, 0) #define ABSL_HAVE_SOURCE_LOCATION_CURRENT 1 #endif #endif @@ -319,25 +333,21 @@ static_assert(ABSL_INTERNAL_INLINE_NAMESPACE_STR[0] != 'h' || // For further details, consult the compiler's documentation. #ifdef ABSL_HAVE_EXCEPTIONS #error ABSL_HAVE_EXCEPTIONS cannot be directly set. 
- -#elif defined(__clang__) - -#if __clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 6) +#elif ABSL_INTERNAL_HAVE_MIN_CLANG_VERSION(3, 6) // Clang >= 3.6 #if ABSL_HAVE_FEATURE(cxx_exceptions) #define ABSL_HAVE_EXCEPTIONS 1 #endif // ABSL_HAVE_FEATURE(cxx_exceptions) -#else +#elif defined(__clang__) // Clang < 3.6 // http://releases.llvm.org/3.6.0/tools/clang/docs/ReleaseNotes.html#the-exceptions-macro #if defined(__EXCEPTIONS) && ABSL_HAVE_FEATURE(cxx_exceptions) #define ABSL_HAVE_EXCEPTIONS 1 #endif // defined(__EXCEPTIONS) && ABSL_HAVE_FEATURE(cxx_exceptions) -#endif // __clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 6) - // Handle remaining special cases and default to exceptions being supported. -#elif !(defined(__GNUC__) && (__GNUC__ < 5) && !defined(__EXCEPTIONS)) && \ - !(defined(__GNUC__) && (__GNUC__ >= 5) && !defined(__cpp_exceptions)) && \ +#elif !(defined(__GNUC__) && (__GNUC__ < 5) && !defined(__EXCEPTIONS)) && \ + !(ABSL_INTERNAL_HAVE_MIN_GNUC_VERSION(5, 0) && \ + !defined(__cpp_exceptions)) && \ !(defined(_MSC_VER) && !defined(_CPPUNWIND)) #define ABSL_HAVE_EXCEPTIONS 1 #endif @@ -690,10 +700,6 @@ static_assert(ABSL_INTERNAL_INLINE_NAMESPACE_STR[0] != 'h' || // a compiler instrumentation module and a run-time library. #ifdef ABSL_HAVE_MEMORY_SANITIZER #error "ABSL_HAVE_MEMORY_SANITIZER cannot be directly set." -#elif defined(MEMORY_SANITIZER) -// The MEMORY_SANITIZER macro is deprecated but we will continue to honor it -// for now. -#define ABSL_HAVE_MEMORY_SANITIZER 1 #elif defined(__SANITIZE_MEMORY__) #define ABSL_HAVE_MEMORY_SANITIZER 1 #elif !defined(__native_client__) && ABSL_HAVE_FEATURE(memory_sanitizer) @@ -705,10 +711,6 @@ static_assert(ABSL_INTERNAL_INLINE_NAMESPACE_STR[0] != 'h' || // ThreadSanitizer (TSan) is a fast data race detector. #ifdef ABSL_HAVE_THREAD_SANITIZER #error "ABSL_HAVE_THREAD_SANITIZER cannot be directly set." 
-#elif defined(THREAD_SANITIZER) -// The THREAD_SANITIZER macro is deprecated but we will continue to honor it -// for now. -#define ABSL_HAVE_THREAD_SANITIZER 1 #elif defined(__SANITIZE_THREAD__) #define ABSL_HAVE_THREAD_SANITIZER 1 #elif ABSL_HAVE_FEATURE(thread_sanitizer) @@ -720,10 +722,6 @@ static_assert(ABSL_INTERNAL_INLINE_NAMESPACE_STR[0] != 'h' || // AddressSanitizer (ASan) is a fast memory error detector. #ifdef ABSL_HAVE_ADDRESS_SANITIZER #error "ABSL_HAVE_ADDRESS_SANITIZER cannot be directly set." -#elif defined(ADDRESS_SANITIZER) -// The ADDRESS_SANITIZER macro is deprecated but we will continue to honor it -// for now. -#define ABSL_HAVE_ADDRESS_SANITIZER 1 #elif defined(__SANITIZE_ADDRESS__) #define ABSL_HAVE_ADDRESS_SANITIZER 1 #elif ABSL_HAVE_FEATURE(address_sanitizer) diff --git a/third_party/abseil-cpp/absl/base/dynamic_annotations.h b/third_party/abseil-cpp/absl/base/dynamic_annotations.h index bf874db990..065bd5be09 100644 --- a/third_party/abseil-cpp/absl/base/dynamic_annotations.h +++ b/third_party/abseil-cpp/absl/base/dynamic_annotations.h @@ -468,7 +468,7 @@ using absl::base_internal::ValgrindSlowdown; __sanitizer_annotate_contiguous_container(beg, end, old_mid, new_mid) #define ABSL_ADDRESS_SANITIZER_REDZONE(name) \ struct { \ - char x[8] __attribute__((aligned(8))); \ + alignas(8) char x[8]; \ } name #else diff --git a/third_party/abseil-cpp/absl/base/internal/exception_safety_testing.h b/third_party/abseil-cpp/absl/base/internal/exception_safety_testing.h index 6ba89d05df..77a5aec642 100644 --- a/third_party/abseil-cpp/absl/base/internal/exception_safety_testing.h +++ b/third_party/abseil-cpp/absl/base/internal/exception_safety_testing.h @@ -536,7 +536,22 @@ class ThrowingValue : private exceptions_internal::TrackedObject { } // Memory management operators - // Args.. 
allows us to overload regular and placement new in one shot + static void* operator new(size_t s) noexcept( + IsSpecified(TypeSpec::kNoThrowNew)) { + if (!IsSpecified(TypeSpec::kNoThrowNew)) { + exceptions_internal::MaybeThrow(ABSL_PRETTY_FUNCTION, true); + } + return ::operator new(s); + } + + static void* operator new[](size_t s) noexcept( + IsSpecified(TypeSpec::kNoThrowNew)) { + if (!IsSpecified(TypeSpec::kNoThrowNew)) { + exceptions_internal::MaybeThrow(ABSL_PRETTY_FUNCTION, true); + } + return ::operator new[](s); + } + template <typename... Args> static void* operator new(size_t s, Args&&... args) noexcept( IsSpecified(TypeSpec::kNoThrowNew)) { @@ -557,12 +572,6 @@ class ThrowingValue : private exceptions_internal::TrackedObject { // Abseil doesn't support throwing overloaded operator delete. These are // provided so a throwing operator-new can clean up after itself. - // - // We provide both regular and templated operator delete because if only the - // templated version is provided as we did with operator new, the compiler has - // no way of knowing which overload of operator delete to call. See - // https://en.cppreference.com/w/cpp/memory/new/operator_delete and - // https://en.cppreference.com/w/cpp/language/delete for the gory details. void operator delete(void* p) noexcept { ::operator delete(p); } template <typename... 
Args> @@ -726,9 +735,8 @@ class ThrowingAllocator : private exceptions_internal::TrackedObject { ThrowingAllocator select_on_container_copy_construction() noexcept( IsSpecified(AllocSpec::kNoThrowAllocate)) { - auto& out = *this; ReadStateAndMaybeThrow(ABSL_PRETTY_FUNCTION); - return out; + return *this; } template <typename U> diff --git a/third_party/abseil-cpp/absl/base/internal/sysinfo.cc b/third_party/abseil-cpp/absl/base/internal/sysinfo.cc index 4a3b205034..08a1e28894 100644 --- a/third_party/abseil-cpp/absl/base/internal/sysinfo.cc +++ b/third_party/abseil-cpp/absl/base/internal/sysinfo.cc @@ -61,9 +61,76 @@ namespace absl { ABSL_NAMESPACE_BEGIN namespace base_internal { +namespace { + +#if defined(_WIN32) + +// Returns number of bits set in `bitMask` +DWORD Win32CountSetBits(ULONG_PTR bitMask) { + for (DWORD bitSetCount = 0; ; ++bitSetCount) { + if (bitMask == 0) return bitSetCount; + bitMask &= bitMask - 1; + } +} + +// Returns the number of logical CPUs using GetLogicalProcessorInformation(), or +// 0 if the number of processors is not available or can not be computed. 
+// https://docs.microsoft.com/en-us/windows/win32/api/sysinfoapi/nf-sysinfoapi-getlogicalprocessorinformation +int Win32NumCPUs() { +#pragma comment(lib, "kernel32.lib") + using Info = SYSTEM_LOGICAL_PROCESSOR_INFORMATION; + + DWORD info_size = sizeof(Info); + Info* info(static_cast<Info*>(malloc(info_size))); + if (info == nullptr) return 0; + + bool success = GetLogicalProcessorInformation(info, &info_size); + if (!success && GetLastError() == ERROR_INSUFFICIENT_BUFFER) { + free(info); + info = static_cast<Info*>(malloc(info_size)); + if (info == nullptr) return 0; + success = GetLogicalProcessorInformation(info, &info_size); + } + + DWORD logicalProcessorCount = 0; + if (success) { + Info* ptr = info; + DWORD byteOffset = 0; + while (byteOffset + sizeof(Info) <= info_size) { + switch (ptr->Relationship) { + case RelationProcessorCore: + logicalProcessorCount += Win32CountSetBits(ptr->ProcessorMask); + break; + + case RelationNumaNode: + case RelationCache: + case RelationProcessorPackage: + // Ignore other entries + break; + + default: + // Ignore unknown entries + break; + } + byteOffset += sizeof(Info); + ptr++; + } + } + free(info); + return logicalProcessorCount; +} + +#endif + +} // namespace + + static int GetNumCPUs() { #if defined(__myriad2__) return 1; +#elif defined(_WIN32) + const unsigned hardware_concurrency = Win32NumCPUs(); + return hardware_concurrency ? 
hardware_concurrency : 1; #else // Other possibilities: // - Read /sys/devices/system/cpu/online and use cpumask_parse() diff --git a/third_party/abseil-cpp/absl/base/internal/thread_identity.cc b/third_party/abseil-cpp/absl/base/internal/thread_identity.cc index 6ea010ed0d..9950e63a79 100644 --- a/third_party/abseil-cpp/absl/base/internal/thread_identity.cc +++ b/third_party/abseil-cpp/absl/base/internal/thread_identity.cc @@ -120,10 +120,10 @@ void SetCurrentThreadIdentity( ABSL_THREAD_IDENTITY_MODE == ABSL_THREAD_IDENTITY_MODE_USE_CPP11 // Please see the comment on `CurrentThreadIdentityIfPresent` in -// thread_identity.h. Because DLLs cannot expose thread_local variables in -// headers, we opt for the correct-but-slower option of placing the definition -// of this function only in a translation unit inside DLL. -#if defined(ABSL_BUILD_DLL) || defined(ABSL_CONSUME_DLL) +// thread_identity.h. When we cannot expose thread_local variables in +// headers, we opt for the correct-but-slower option of not inlining this +// function. +#ifndef ABSL_INTERNAL_INLINE_CURRENT_THREAD_IDENTITY_IF_PRESENT ThreadIdentity* CurrentThreadIdentityIfPresent() { return thread_identity_ptr; } #endif #endif diff --git a/third_party/abseil-cpp/absl/base/internal/thread_identity.h b/third_party/abseil-cpp/absl/base/internal/thread_identity.h index 9ee651a3a6..6e25b92fa2 100644 --- a/third_party/abseil-cpp/absl/base/internal/thread_identity.h +++ b/third_party/abseil-cpp/absl/base/internal/thread_identity.h @@ -236,13 +236,18 @@ ABSL_CONST_INIT extern thread_local ThreadIdentity* thread_identity_ptr; #error Thread-local storage not detected on this platform #endif -// thread_local variables cannot be in headers exposed by DLLs. However, it is -// important for performance reasons in general that -// `CurrentThreadIdentityIfPresent` be inlined. This is not possible across a -// DLL boundary so, with DLLs, we opt to have the function not be inlined. 
Note +// thread_local variables cannot be in headers exposed by DLLs or in certain +// build configurations on Apple platforms. However, it is important for +// performance reasons in general that `CurrentThreadIdentityIfPresent` be +// inlined. In the other cases we opt to have the function not be inlined. Note // that `CurrentThreadIdentityIfPresent` is declared above so we can exclude -// this entire inline definition when compiling as a DLL. -#if !defined(ABSL_BUILD_DLL) && !defined(ABSL_CONSUME_DLL) +// this entire inline definition. +#if !defined(__APPLE__) && !defined(ABSL_BUILD_DLL) && \ + !defined(ABSL_CONSUME_DLL) +#define ABSL_INTERNAL_INLINE_CURRENT_THREAD_IDENTITY_IF_PRESENT 1 +#endif + +#ifdef ABSL_INTERNAL_INLINE_CURRENT_THREAD_IDENTITY_IF_PRESENT inline ThreadIdentity* CurrentThreadIdentityIfPresent() { return thread_identity_ptr; } diff --git a/third_party/abseil-cpp/absl/base/optimization.h b/third_party/abseil-cpp/absl/base/optimization.h index 6332b62584..d090be1286 100644 --- a/third_party/abseil-cpp/absl/base/optimization.h +++ b/third_party/abseil-cpp/absl/base/optimization.h @@ -106,9 +106,10 @@ // Cacheline aligning objects properly allows constructive memory sharing and // prevents destructive (or "false") memory sharing. // -// NOTE: this macro should be replaced with usage of `alignas()` using +// NOTE: callers should replace uses of this macro with `alignas()` using // `std::hardware_constructive_interference_size` and/or -// `std::hardware_destructive_interference_size` when available within C++17. +// `std::hardware_destructive_interference_size` when C++17 becomes available to +// them. // // See http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2016/p0154r1.html // for more information. 
diff --git a/third_party/abseil-cpp/absl/cleanup/CMakeLists.txt b/third_party/abseil-cpp/absl/cleanup/CMakeLists.txt index a2dd78a84a..26a6d0dce3 100644 --- a/third_party/abseil-cpp/absl/cleanup/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/cleanup/CMakeLists.txt @@ -51,5 +51,5 @@ absl_cc_test( absl::cleanup absl::config absl::utility - gmock_main + GTest::gmock_main ) diff --git a/third_party/abseil-cpp/absl/container/CMakeLists.txt b/third_party/abseil-cpp/absl/container/CMakeLists.txt index 2d7d0e65f2..91c4015437 100644 --- a/third_party/abseil-cpp/absl/container/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/container/CMakeLists.txt @@ -80,7 +80,7 @@ absl_cc_test( absl::strings absl::test_instance_tracker absl::type_traits - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -109,7 +109,7 @@ absl_cc_test( absl::optional absl::test_instance_tracker absl::utility - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -144,7 +144,7 @@ absl_cc_test( absl::exception_testing absl::hash_testing absl::memory - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -158,7 +158,7 @@ absl_cc_test( absl::fixed_array absl::config absl::exception_safety_testing - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -222,7 +222,7 @@ absl_cc_test( absl::memory absl::raw_logging_internal absl::strings - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -236,7 +236,7 @@ absl_cc_test( absl::inlined_vector absl::config absl::exception_safety_testing - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -262,7 +262,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::test_instance_tracker - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -297,7 +297,7 @@ absl_cc_test( absl::unordered_map_modifiers_test absl::any absl::raw_logging_internal - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -335,7 +335,7 @@ absl_cc_test( absl::memory absl::raw_logging_internal absl::strings - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -370,7 +370,7 @@ absl_cc_test( 
absl::unordered_map_lookup_test absl::unordered_map_members_test absl::unordered_map_modifiers_test - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -404,7 +404,7 @@ absl_cc_test( absl::unordered_set_lookup_test absl::unordered_set_members_test absl::unordered_set_modifiers_test - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -433,7 +433,7 @@ absl_cc_test( absl::container_memory absl::strings absl::test_instance_tracker - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -465,7 +465,7 @@ absl_cc_test( absl::hash absl::random_random absl::strings - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -507,7 +507,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::hash_policy_testing - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -531,7 +531,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::hash_policy_traits - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -561,7 +561,7 @@ absl_cc_test( DEPS absl::hashtablez_sampler absl::have_sse - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -618,7 +618,7 @@ absl_cc_test( DEPS absl::hash_policy_traits absl::node_hash_policy - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -693,7 +693,7 @@ absl_cc_test( absl::core_headers absl::raw_logging_internal absl::strings - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -707,7 +707,7 @@ absl_cc_test( absl::raw_hash_set absl::tracked absl::core_headers - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -740,7 +740,7 @@ absl_cc_test( absl::core_headers absl::raw_logging_internal absl::span - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -765,7 +765,7 @@ absl_cc_library( DEPS absl::hash_generator_testing absl::hash_policy_testing - gmock + GTest::gmock TESTONLY ) @@ -779,7 +779,7 @@ absl_cc_library( DEPS absl::hash_generator_testing absl::hash_policy_testing - gmock + GTest::gmock TESTONLY ) @@ -792,7 +792,7 @@ absl_cc_library( ${ABSL_TEST_COPTS} DEPS absl::type_traits - gmock + GTest::gmock TESTONLY ) @@ -806,7 +806,7 @@ absl_cc_library( 
DEPS absl::hash_generator_testing absl::hash_policy_testing - gmock + GTest::gmock TESTONLY ) @@ -820,7 +820,7 @@ absl_cc_library( DEPS absl::hash_generator_testing absl::hash_policy_testing - gmock + GTest::gmock TESTONLY ) @@ -834,7 +834,7 @@ absl_cc_library( DEPS absl::hash_generator_testing absl::hash_policy_testing - gmock + GTest::gmock TESTONLY ) @@ -847,7 +847,7 @@ absl_cc_library( ${ABSL_TEST_COPTS} DEPS absl::type_traits - gmock + GTest::gmock TESTONLY ) @@ -861,7 +861,7 @@ absl_cc_library( DEPS absl::hash_generator_testing absl::hash_policy_testing - gmock + GTest::gmock TESTONLY ) @@ -877,7 +877,7 @@ absl_cc_test( absl::unordered_set_lookup_test absl::unordered_set_members_test absl::unordered_set_modifiers_test - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -892,5 +892,5 @@ absl_cc_test( absl::unordered_map_lookup_test absl::unordered_map_members_test absl::unordered_map_modifiers_test - gmock_main + GTest::gmock_main ) diff --git a/third_party/abseil-cpp/absl/container/btree_test.cc b/third_party/abseil-cpp/absl/container/btree_test.cc index 74337df2c1..d5d79151aa 100644 --- a/third_party/abseil-cpp/absl/container/btree_test.cc +++ b/third_party/abseil-cpp/absl/container/btree_test.cc @@ -1708,10 +1708,25 @@ TEST(Btree, StrSplitCompatible) { EXPECT_EQ(split_set, expected_set); } -// We can't use EXPECT_EQ/etc. to compare absl::weak_ordering because they -// convert literal 0 to int and absl::weak_ordering can only be compared with -// literal 0. Defining this function allows for avoiding ClangTidy warnings. 
-bool Identity(const bool b) { return b; } +TEST(Btree, KeyComp) { + absl::btree_set<int> s; + EXPECT_TRUE(s.key_comp()(1, 2)); + EXPECT_FALSE(s.key_comp()(2, 2)); + EXPECT_FALSE(s.key_comp()(2, 1)); + + absl::btree_map<int, int> m1; + EXPECT_TRUE(m1.key_comp()(1, 2)); + EXPECT_FALSE(m1.key_comp()(2, 2)); + EXPECT_FALSE(m1.key_comp()(2, 1)); + + // Even though we internally adapt the comparator of `m2` to be three-way and + // heterogeneous, the comparator we expose through key_comp() is the original + // unadapted comparator. + absl::btree_map<std::string, int> m2; + EXPECT_TRUE(m2.key_comp()("a", "b")); + EXPECT_FALSE(m2.key_comp()("b", "b")); + EXPECT_FALSE(m2.key_comp()("b", "a")); +} TEST(Btree, ValueComp) { absl::btree_set<int> s; @@ -1724,13 +1739,13 @@ TEST(Btree, ValueComp) { EXPECT_FALSE(m1.value_comp()(std::make_pair(2, 0), std::make_pair(2, 0))); EXPECT_FALSE(m1.value_comp()(std::make_pair(2, 0), std::make_pair(1, 0))); + // Even though we internally adapt the comparator of `m2` to be three-way and + // heterogeneous, the comparator we expose through value_comp() is based on + // the original unadapted comparator. 
absl::btree_map<std::string, int> m2; - EXPECT_TRUE(Identity( - m2.value_comp()(std::make_pair("a", 0), std::make_pair("b", 0)) < 0)); - EXPECT_TRUE(Identity( - m2.value_comp()(std::make_pair("b", 0), std::make_pair("b", 0)) == 0)); - EXPECT_TRUE(Identity( - m2.value_comp()(std::make_pair("b", 0), std::make_pair("a", 0)) > 0)); + EXPECT_TRUE(m2.value_comp()(std::make_pair("a", 0), std::make_pair("b", 0))); + EXPECT_FALSE(m2.value_comp()(std::make_pair("b", 0), std::make_pair("b", 0))); + EXPECT_FALSE(m2.value_comp()(std::make_pair("b", 0), std::make_pair("a", 0))); } TEST(Btree, DefaultConstruction) { @@ -2893,6 +2908,46 @@ TEST(Btree, AllocMoveConstructor_DifferentAlloc) { EXPECT_EQ(bytes_used2, original_bytes_used); } +bool IntCmp(const int a, const int b) { return a < b; } + +TEST(Btree, SupportsFunctionPtrComparator) { + absl::btree_set<int, decltype(IntCmp) *> set(IntCmp); + set.insert({1, 2, 3}); + EXPECT_THAT(set, ElementsAre(1, 2, 3)); + EXPECT_TRUE(set.key_comp()(1, 2)); + EXPECT_TRUE(set.value_comp()(1, 2)); + + absl::btree_map<int, int, decltype(IntCmp) *> map(&IntCmp); + map[1] = 1; + EXPECT_THAT(map, ElementsAre(Pair(1, 1))); + EXPECT_TRUE(map.key_comp()(1, 2)); + EXPECT_TRUE(map.value_comp()(std::make_pair(1, 1), std::make_pair(2, 2))); +} + +template <typename Compare> +struct TransparentPassThroughComp { + using is_transparent = void; + + // This will fail compilation if we attempt a comparison that Compare does not + // support, and the failure will happen inside the function implementation so + // it can't be avoided by using SFINAE on this comparator. 
+ template <typename T, typename U> + bool operator()(const T &lhs, const U &rhs) const { + return Compare()(lhs, rhs); + } +}; + +TEST(Btree, + SupportsTransparentComparatorThatDoesNotImplementAllVisibleOperators) { + absl::btree_set<MultiKey, TransparentPassThroughComp<MultiKeyComp>> set; + set.insert(MultiKey{1, 2}); + EXPECT_TRUE(set.contains(1)); +} + +TEST(Btree, ConstructImplicitlyWithUnadaptedComparator) { + absl::btree_set<MultiKey, MultiKeyComp> set = {{}, MultiKeyComp{}}; +} + } // namespace } // namespace container_internal ABSL_NAMESPACE_END diff --git a/third_party/abseil-cpp/absl/container/flat_hash_map_test.cc b/third_party/abseil-cpp/absl/container/flat_hash_map_test.cc index 89ec60c916..8dda1d3539 100644 --- a/third_party/abseil-cpp/absl/container/flat_hash_map_test.cc +++ b/third_party/abseil-cpp/absl/container/flat_hash_map_test.cc @@ -282,6 +282,32 @@ TEST(FlatHashMap, NodeHandleMutableKeyAccess) { } #endif +TEST(FlatHashMap, Reserve) { + // Verify that if we reserve(size() + n) then we can perform n insertions + // without a rehash, i.e., without invalidating any references. + for (size_t trial = 0; trial < 20; ++trial) { + for (size_t initial = 3; initial < 100; ++initial) { + // Fill in `initial` entries, then erase 2 of them, then reserve space for + // two inserts and check for reference stability while doing the inserts. + flat_hash_map<size_t, size_t> map; + for (size_t i = 0; i < initial; ++i) { + map[i] = i; + } + map.erase(0); + map.erase(1); + map.reserve(map.size() + 2); + size_t& a2 = map[2]; + // In the event of a failure, asan will complain in one of these two + // assignments. 
+ map[initial] = a2; + map[initial + 1] = a2; + // Fail even when not under asan: + size_t& a2new = map[2]; + EXPECT_EQ(&a2, &a2new); + } + } +} + } // namespace } // namespace container_internal ABSL_NAMESPACE_END diff --git a/third_party/abseil-cpp/absl/container/internal/btree.h b/third_party/abseil-cpp/absl/container/internal/btree.h index 00444a5397..f636c5fc73 100644 --- a/third_party/abseil-cpp/absl/container/internal/btree.h +++ b/third_party/abseil-cpp/absl/container/internal/btree.h @@ -88,7 +88,12 @@ struct StringBtreeDefaultLess { // Compatibility constructor. StringBtreeDefaultLess(std::less<std::string>) {} // NOLINT - StringBtreeDefaultLess(std::less<string_view>) {} // NOLINT + StringBtreeDefaultLess(std::less<absl::string_view>) {} // NOLINT + + // Allow converting to std::less for use in key_comp()/value_comp(). + explicit operator std::less<std::string>() const { return {}; } + explicit operator std::less<absl::string_view>() const { return {}; } + explicit operator std::less<absl::Cord>() const { return {}; } absl::weak_ordering operator()(absl::string_view lhs, absl::string_view rhs) const { @@ -115,7 +120,12 @@ struct StringBtreeDefaultGreater { StringBtreeDefaultGreater() = default; StringBtreeDefaultGreater(std::greater<std::string>) {} // NOLINT - StringBtreeDefaultGreater(std::greater<string_view>) {} // NOLINT + StringBtreeDefaultGreater(std::greater<absl::string_view>) {} // NOLINT + + // Allow converting to std::greater for use in key_comp()/value_comp(). 
+ explicit operator std::greater<std::string>() const { return {}; } + explicit operator std::greater<absl::string_view>() const { return {}; } + explicit operator std::greater<absl::Cord>() const { return {}; } absl::weak_ordering operator()(absl::string_view lhs, absl::string_view rhs) const { @@ -217,6 +227,8 @@ struct prefers_linear_node_search< template <typename Key, typename Compare, typename Alloc, int TargetNodeSize, bool Multi, typename SlotPolicy> struct common_params { + using original_key_compare = Compare; + // If Compare is a common comparator for a string-like type, then we adapt it // to use heterogeneous lookup and to be a key-compare-to comparator. using key_compare = typename key_compare_to_adapter<Compare>::type; @@ -317,16 +329,21 @@ struct map_params : common_params<Key, Compare, Alloc, TargetNodeSize, Multi, using value_type = typename super_type::value_type; using init_type = typename super_type::init_type; - using key_compare = typename super_type::key_compare; - // Inherit from key_compare for empty base class optimization. 
- struct value_compare : private key_compare { - value_compare() = default; - explicit value_compare(const key_compare &cmp) : key_compare(cmp) {} + using original_key_compare = typename super_type::original_key_compare; + // Reference: https://en.cppreference.com/w/cpp/container/map/value_compare + class value_compare { + template <typename Params> + friend class btree; - template <typename T, typename U> - auto operator()(const T &left, const U &right) const - -> decltype(std::declval<key_compare>()(left.first, right.first)) { - return key_compare::operator()(left.first, right.first); + protected: + explicit value_compare(original_key_compare c) : comp(std::move(c)) {} + + original_key_compare comp; // NOLINT + + public: + auto operator()(const value_type &lhs, const value_type &rhs) const + -> decltype(comp(lhs.first, rhs.first)) { + return comp(lhs.first, rhs.first); } }; using is_map_container = std::true_type; @@ -392,7 +409,8 @@ struct set_params : common_params<Key, Compare, Alloc, TargetNodeSize, Multi, set_slot_policy<Key>> { using value_type = Key; using slot_type = typename set_params::common_params::slot_type; - using value_compare = typename set_params::common_params::key_compare; + using value_compare = + typename set_params::common_params::original_key_compare; using is_map_container = std::false_type; template <typename V> @@ -484,8 +502,8 @@ class btree_node { std::is_same<std::greater<key_type>, key_compare>::value)>; - // This class is organized by gtl::Layout as if it had the following - // structure: + // This class is organized by absl::container_internal::Layout as if it had + // the following structure: // // A pointer to the node's parent. // btree_node *parent; // @@ -579,10 +597,10 @@ class btree_node { }; // Leaves can have less than kNodeSlots values. 
- constexpr static layout_type LeafLayout(const int slots = kNodeSlots) { + constexpr static layout_type LeafLayout(const int slot_count = kNodeSlots) { return layout_type(/*parent*/ 1, /*position, start, finish, max_count*/ 4, - /*slots*/ slots, + /*slots*/ slot_count, /*children*/ 0); } constexpr static layout_type InternalLayout() { @@ -591,8 +609,8 @@ class btree_node { /*slots*/ kNodeSlots, /*children*/ kNodeSlots + 1); } - constexpr static size_type LeafSize(const int slots = kNodeSlots) { - return LeafLayout(slots).AllocSize(); + constexpr static size_type LeafSize(const int slot_count = kNodeSlots) { + return LeafLayout(slot_count).AllocSize(); } constexpr static size_type InternalSize() { return InternalLayout().AllocSize(); @@ -1129,6 +1147,7 @@ class btree { using size_type = typename Params::size_type; using difference_type = typename Params::difference_type; using key_compare = typename Params::key_compare; + using original_key_compare = typename Params::original_key_compare; using value_compare = typename Params::value_compare; using allocator_type = typename Params::allocator_type; using reference = typename Params::reference; @@ -1338,7 +1357,9 @@ class btree { return compare_internal::compare_result_as_less_than(key_comp()(a, b)); } - value_compare value_comp() const { return value_compare(key_comp()); } + value_compare value_comp() const { + return value_compare(original_key_compare(key_comp())); + } // Verifies the structure of the btree. 
void verify() const; diff --git a/third_party/abseil-cpp/absl/container/internal/btree_container.h b/third_party/abseil-cpp/absl/container/internal/btree_container.h index 03be708e4f..a99668c713 100644 --- a/third_party/abseil-cpp/absl/container/internal/btree_container.h +++ b/third_party/abseil-cpp/absl/container/internal/btree_container.h @@ -20,6 +20,7 @@ #include <iterator> #include <utility> +#include "absl/base/attributes.h" #include "absl/base/internal/throw_delegate.h" #include "absl/container/internal/btree.h" // IWYU pragma: export #include "absl/container/internal/common.h" @@ -51,7 +52,7 @@ class btree_container { using value_type = typename Tree::value_type; using size_type = typename Tree::size_type; using difference_type = typename Tree::difference_type; - using key_compare = typename Tree::key_compare; + using key_compare = typename Tree::original_key_compare; using value_compare = typename Tree::value_compare; using allocator_type = typename Tree::allocator_type; using reference = typename Tree::reference; @@ -176,7 +177,7 @@ class btree_container { } // Utility routines. - void clear() { tree_.clear(); } + ABSL_ATTRIBUTE_REINITIALIZES void clear() { tree_.clear(); } void swap(btree_container &other) { tree_.swap(other.tree_); } void verify() const { tree_.verify(); } @@ -214,7 +215,7 @@ class btree_container { allocator_type get_allocator() const { return tree_.get_allocator(); } // The key comparator used by the btree. - key_compare key_comp() const { return tree_.key_comp(); } + key_compare key_comp() const { return key_compare(tree_.key_comp()); } value_compare value_comp() const { return tree_.value_comp(); } // Support absl::Hash. 
@@ -247,7 +248,7 @@ class btree_set_container : public btree_container<Tree> { using key_type = typename Tree::key_type; using value_type = typename Tree::value_type; using size_type = typename Tree::size_type; - using key_compare = typename Tree::key_compare; + using key_compare = typename Tree::original_key_compare; using allocator_type = typename Tree::allocator_type; using iterator = typename Tree::iterator; using const_iterator = typename Tree::const_iterator; @@ -398,7 +399,7 @@ class btree_map_container : public btree_set_container<Tree> { using key_type = typename Tree::key_type; using mapped_type = typename params_type::mapped_type; using value_type = typename Tree::value_type; - using key_compare = typename Tree::key_compare; + using key_compare = typename Tree::original_key_compare; using allocator_type = typename Tree::allocator_type; using iterator = typename Tree::iterator; using const_iterator = typename Tree::const_iterator; @@ -543,7 +544,7 @@ class btree_multiset_container : public btree_container<Tree> { using key_type = typename Tree::key_type; using value_type = typename Tree::value_type; using size_type = typename Tree::size_type; - using key_compare = typename Tree::key_compare; + using key_compare = typename Tree::original_key_compare; using allocator_type = typename Tree::allocator_type; using iterator = typename Tree::iterator; using const_iterator = typename Tree::const_iterator; diff --git a/third_party/abseil-cpp/absl/container/internal/hash_generator_testing.h b/third_party/abseil-cpp/absl/container/internal/hash_generator_testing.h index 6869fe45e8..f1f555a5c1 100644 --- a/third_party/abseil-cpp/absl/container/internal/hash_generator_testing.h +++ b/third_party/abseil-cpp/absl/container/internal/hash_generator_testing.h @@ -21,11 +21,13 @@ #include <stdint.h> #include <algorithm> +#include <cassert> #include <iosfwd> #include <random> #include <tuple> #include <type_traits> #include <utility> +#include <vector> #include 
"absl/container/internal/hash_policy_testing.h" #include "absl/memory/memory.h" @@ -153,6 +155,25 @@ using GeneratedType = decltype( typename Container::value_type, typename Container::key_type>::type>&>()()); +// Naive wrapper that performs a linear search of previous values. +// Beware this is O(SQR), which is reasonable for smaller kMaxValues. +template <class T, size_t kMaxValues = 64, class E = void> +struct UniqueGenerator { + Generator<T, E> gen; + std::vector<T> values; + + T operator()() { + assert(values.size() < kMaxValues); + for (;;) { + T value = gen(); + if (std::find(values.begin(), values.end(), value) == values.end()) { + values.push_back(value); + return value; + } + } + } +}; + } // namespace hash_internal } // namespace container_internal ABSL_NAMESPACE_END diff --git a/third_party/abseil-cpp/absl/container/internal/inlined_vector.h b/third_party/abseil-cpp/absl/container/internal/inlined_vector.h index b8aec45b79..49822af0b7 100644 --- a/third_party/abseil-cpp/absl/container/internal/inlined_vector.h +++ b/third_party/abseil-cpp/absl/container/internal/inlined_vector.h @@ -36,6 +36,7 @@ namespace inlined_vector_internal { // GCC does not deal very well with the below code #if !defined(__clang__) && defined(__GNUC__) #pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif @@ -955,7 +956,7 @@ auto Storage<T, N, A>::Swap(Storage* other_storage_ptr) -> void { swap(*GetAllocPtr(), *other_storage_ptr->GetAllocPtr()); } -// End ignore "maybe-uninitialized" +// End ignore "array-bounds" and "maybe-uninitialized" #if !defined(__clang__) && defined(__GNUC__) #pragma GCC diagnostic pop #endif diff --git a/third_party/abseil-cpp/absl/container/internal/layout.h b/third_party/abseil-cpp/absl/container/internal/layout.h index 2336783315..a59a243059 100644 --- a/third_party/abseil-cpp/absl/container/internal/layout.h +++ 
b/third_party/abseil-cpp/absl/container/internal/layout.h @@ -404,7 +404,7 @@ class LayoutImpl<std::tuple<Elements...>, absl::index_sequence<SizeSeq...>, constexpr size_t Offset() const { static_assert(N < NumOffsets, "Index out of bounds"); return adl_barrier::Align( - Offset<N - 1>() + SizeOf<ElementType<N - 1>>() * size_[N - 1], + Offset<N - 1>() + SizeOf<ElementType<N - 1>>::value * size_[N - 1], ElementAlignment<N>::value); } @@ -597,7 +597,7 @@ class LayoutImpl<std::tuple<Elements...>, absl::index_sequence<SizeSeq...>, constexpr size_t AllocSize() const { static_assert(NumTypes == NumSizes, "You must specify sizes of all fields"); return Offset<NumTypes - 1>() + - SizeOf<ElementType<NumTypes - 1>>() * size_[NumTypes - 1]; + SizeOf<ElementType<NumTypes - 1>>::value * size_[NumTypes - 1]; } // If built with --config=asan, poisons padding bytes (if any) in the @@ -621,7 +621,7 @@ class LayoutImpl<std::tuple<Elements...>, absl::index_sequence<SizeSeq...>, // The `if` is an optimization. It doesn't affect the observable behaviour. if (ElementAlignment<N - 1>::value % ElementAlignment<N>::value) { size_t start = - Offset<N - 1>() + SizeOf<ElementType<N - 1>>() * size_[N - 1]; + Offset<N - 1>() + SizeOf<ElementType<N - 1>>::value * size_[N - 1]; ASAN_POISON_MEMORY_REGION(p + start, Offset<N>() - start); } #endif @@ -645,7 +645,7 @@ class LayoutImpl<std::tuple<Elements...>, absl::index_sequence<SizeSeq...>, // produce "unsigned*" where another produces "unsigned int *". 
std::string DebugString() const { const auto offsets = Offsets(); - const size_t sizes[] = {SizeOf<ElementType<OffsetSeq>>()...}; + const size_t sizes[] = {SizeOf<ElementType<OffsetSeq>>::value...}; const std::string types[] = { adl_barrier::TypeName<ElementType<OffsetSeq>>()...}; std::string res = absl::StrCat("@0", types[0], "(", sizes[0], ")"); diff --git a/third_party/abseil-cpp/absl/container/internal/raw_hash_map.h b/third_party/abseil-cpp/absl/container/internal/raw_hash_map.h index 0a02757ddf..c7df2efc62 100644 --- a/third_party/abseil-cpp/absl/container/internal/raw_hash_map.h +++ b/third_party/abseil-cpp/absl/container/internal/raw_hash_map.h @@ -51,8 +51,9 @@ class raw_hash_map : public raw_hash_set<Policy, Hash, Eq, Alloc> { using key_arg = typename KeyArgImpl::template type<K, key_type>; static_assert(!std::is_reference<key_type>::value, ""); - // TODO(alkis): remove this assertion and verify that reference mapped_type is - // supported. + + // TODO(b/187807849): Evaluate whether to support reference mapped_type and + // remove this assertion if/when it is supported. static_assert(!std::is_reference<mapped_type>::value, ""); using iterator = typename raw_hash_map::raw_hash_set::iterator; diff --git a/third_party/abseil-cpp/absl/container/internal/raw_hash_set.h b/third_party/abseil-cpp/absl/container/internal/raw_hash_set.h index 80fc2cba3f..aa78265ca1 100644 --- a/third_party/abseil-cpp/absl/container/internal/raw_hash_set.h +++ b/third_party/abseil-cpp/absl/container/internal/raw_hash_set.h @@ -628,7 +628,9 @@ class raw_hash_set { static Layout MakeLayout(size_t capacity) { assert(IsValidCapacity(capacity)); - return Layout(capacity + Group::kWidth + 1, capacity); + // The extra control bytes are for 1 sentinel byte followed by + // `Group::kWidth - 1` bytes that are cloned from the beginning. 
+ return Layout(capacity + Group::kWidth, capacity); } using AllocTraits = absl::allocator_traits<allocator_type>; @@ -792,7 +794,8 @@ class raw_hash_set { explicit raw_hash_set(size_t bucket_count, const hasher& hash = hasher(), const key_equal& eq = key_equal(), const allocator_type& alloc = allocator_type()) - : ctrl_(EmptyGroup()), settings_(0, hash, eq, alloc) { + : ctrl_(EmptyGroup()), + settings_(0, HashtablezInfoHandle(), hash, eq, alloc) { if (bucket_count) { capacity_ = NormalizeCapacity(bucket_count); initialize_slots(); @@ -903,7 +906,7 @@ class raw_hash_set { auto target = find_first_non_full(ctrl_, hash, capacity_); set_ctrl(target.offset, H2(hash)); emplace_at(target.offset, v); - infoz_.RecordInsert(hash, target.probe_length); + infoz().RecordInsert(hash, target.probe_length); } size_ = that.size(); growth_left() -= that.size(); @@ -917,28 +920,27 @@ class raw_hash_set { slots_(absl::exchange(that.slots_, nullptr)), size_(absl::exchange(that.size_, 0)), capacity_(absl::exchange(that.capacity_, 0)), - infoz_(absl::exchange(that.infoz_, HashtablezInfoHandle())), // Hash, equality and allocator are copied instead of moved because // `that` must be left valid. If Hash is std::function<Key>, moving it // would create a nullptr functor that cannot be called. - settings_(that.settings_) { - // growth_left was copied above, reset the one from `that`. 
- that.growth_left() = 0; - } + settings_(absl::exchange(that.growth_left(), 0), + absl::exchange(that.infoz(), HashtablezInfoHandle()), + that.hash_ref(), that.eq_ref(), that.alloc_ref()) {} raw_hash_set(raw_hash_set&& that, const allocator_type& a) : ctrl_(EmptyGroup()), slots_(nullptr), size_(0), capacity_(0), - settings_(0, that.hash_ref(), that.eq_ref(), a) { + settings_(0, HashtablezInfoHandle(), that.hash_ref(), that.eq_ref(), + a) { if (a == that.alloc_ref()) { std::swap(ctrl_, that.ctrl_); std::swap(slots_, that.slots_); std::swap(size_, that.size_); std::swap(capacity_, that.capacity_); std::swap(growth_left(), that.growth_left()); - std::swap(infoz_, that.infoz_); + std::swap(infoz(), that.infoz()); } else { reserve(that.size()); // Note: this will copy elements of dense_set and unordered_set instead of @@ -1009,7 +1011,7 @@ class raw_hash_set { reset_growth_left(); } assert(empty()); - infoz_.RecordStorageChanged(0, capacity_); + infoz().RecordStorageChanged(0, capacity_); } // This overload kicks in when the argument is an rvalue of insertable and @@ -1301,7 +1303,7 @@ class raw_hash_set { swap(growth_left(), that.growth_left()); swap(hash_ref(), that.hash_ref()); swap(eq_ref(), that.eq_ref()); - swap(infoz_, that.infoz_); + swap(infoz(), that.infoz()); SwapAlloc(alloc_ref(), that.alloc_ref(), typename AllocTraits::propagate_on_container_swap{}); } @@ -1310,7 +1312,7 @@ class raw_hash_set { if (n == 0 && capacity_ == 0) return; if (n == 0 && size_ == 0) { destroy_slots(); - infoz_.RecordStorageChanged(0, 0); + infoz().RecordStorageChanged(0, 0); return; } // bitor is a faster way of doing `max` here. We will round up to the next @@ -1323,8 +1325,8 @@ class raw_hash_set { } void reserve(size_t n) { - size_t m = GrowthToLowerboundCapacity(n); - if (m > capacity_) { + if (n > size() + growth_left()) { + size_t m = GrowthToLowerboundCapacity(n); resize(NormalizeCapacity(m)); } } @@ -1528,7 +1530,7 @@ class raw_hash_set { set_ctrl(index, was_never_full ? 
kEmpty : kDeleted); growth_left() += was_never_full; - infoz_.RecordErase(); + infoz().RecordErase(); } void initialize_slots() { @@ -1545,17 +1547,17 @@ class raw_hash_set { // bound more carefully. if (std::is_same<SlotAlloc, std::allocator<slot_type>>::value && slots_ == nullptr) { - infoz_ = Sample(); + infoz() = Sample(); } auto layout = MakeLayout(capacity_); char* mem = static_cast<char*>( Allocate<Layout::Alignment()>(&alloc_ref(), layout.AllocSize())); - ctrl_ = reinterpret_cast<ctrl_t*>(layout.template Pointer<0>(mem)); + ctrl_ = layout.template Pointer<0>(mem); slots_ = layout.template Pointer<1>(mem); reset_ctrl(); reset_growth_left(); - infoz_.RecordStorageChanged(size_, capacity_); + infoz().RecordStorageChanged(size_, capacity_); } void destroy_slots() { @@ -1603,7 +1605,7 @@ class raw_hash_set { Deallocate<Layout::Alignment()>(&alloc_ref(), old_ctrl, layout.AllocSize()); } - infoz_.RecordRehash(total_probe_length); + infoz().RecordRehash(total_probe_length); } void drop_deletes_without_resize() ABSL_ATTRIBUTE_NOINLINE { @@ -1669,7 +1671,7 @@ class raw_hash_set { } } reset_growth_left(); - infoz_.RecordRehash(total_probe_length); + infoz().RecordRehash(total_probe_length); } void rehash_and_grow_if_necessary() { @@ -1743,7 +1745,7 @@ class raw_hash_set { ++size_; growth_left() -= IsEmpty(ctrl_[target.offset]); set_ctrl(target.offset, H2(hash)); - infoz_.RecordInsert(hash, target.probe_length); + infoz().RecordInsert(hash, target.probe_length); return target.offset; } @@ -1782,8 +1784,8 @@ class raw_hash_set { growth_left() = CapacityToGrowth(capacity()) - size_; } - // Sets the control byte, and if `i < Group::kWidth`, set the cloned byte at - // the end too. + // Sets the control byte, and if `i < Group::kWidth - 1`, set the cloned byte + // at the end too. 
void set_ctrl(size_t i, ctrl_t h) { assert(i < capacity_); @@ -1794,32 +1796,35 @@ class raw_hash_set { } ctrl_[i] = h; - ctrl_[((i - Group::kWidth) & capacity_) + 1 + - ((Group::kWidth - 1) & capacity_)] = h; + constexpr size_t kClonedBytes = Group::kWidth - 1; + ctrl_[((i - kClonedBytes) & capacity_) + (kClonedBytes & capacity_)] = h; } size_t& growth_left() { return settings_.template get<0>(); } - hasher& hash_ref() { return settings_.template get<1>(); } - const hasher& hash_ref() const { return settings_.template get<1>(); } - key_equal& eq_ref() { return settings_.template get<2>(); } - const key_equal& eq_ref() const { return settings_.template get<2>(); } - allocator_type& alloc_ref() { return settings_.template get<3>(); } + HashtablezInfoHandle& infoz() { return settings_.template get<1>(); } + + hasher& hash_ref() { return settings_.template get<2>(); } + const hasher& hash_ref() const { return settings_.template get<2>(); } + key_equal& eq_ref() { return settings_.template get<3>(); } + const key_equal& eq_ref() const { return settings_.template get<3>(); } + allocator_type& alloc_ref() { return settings_.template get<4>(); } const allocator_type& alloc_ref() const { - return settings_.template get<3>(); + return settings_.template get<4>(); } // TODO(alkis): Investigate removing some of these fields: // - ctrl/slots can be derived from each other // - size can be moved into the slot array - ctrl_t* ctrl_ = EmptyGroup(); // [(capacity + 1) * ctrl_t] + ctrl_t* ctrl_ = EmptyGroup(); // [(capacity + Group::kWidth) * ctrl_t] slot_type* slots_ = nullptr; // [capacity * slot_type] size_t size_ = 0; // number of full slots size_t capacity_ = 0; // total number of slots - HashtablezInfoHandle infoz_; - absl::container_internal::CompressedTuple<size_t /* growth_left */, hasher, + absl::container_internal::CompressedTuple<size_t /* growth_left */, + HashtablezInfoHandle, hasher, key_equal, allocator_type> - settings_{0, hasher{}, key_equal{}, allocator_type{}}; 
+ settings_{0, HashtablezInfoHandle{}, hasher{}, key_equal{}, + allocator_type{}}; }; // Erases all elements that satisfy the predicate `pred` from the container `c`. diff --git a/third_party/abseil-cpp/absl/container/internal/raw_hash_set_test.cc b/third_party/abseil-cpp/absl/container/internal/raw_hash_set_test.cc index 81c4b47c04..af882ef49f 100644 --- a/third_party/abseil-cpp/absl/container/internal/raw_hash_set_test.cc +++ b/third_party/abseil-cpp/absl/container/internal/raw_hash_set_test.cc @@ -419,6 +419,13 @@ TEST(Table, EmptyFunctorOptimization) { size_t growth_left; void* infoz; }; + struct MockTableInfozDisabled { + void* ctrl; + void* slots; + size_t size; + size_t capacity; + size_t growth_left; + }; struct StatelessHash { size_t operator()(absl::string_view) const { return 0; } }; @@ -426,17 +433,27 @@ TEST(Table, EmptyFunctorOptimization) { size_t dummy; }; - EXPECT_EQ( - sizeof(MockTable), - sizeof( - raw_hash_set<StringPolicy, StatelessHash, - std::equal_to<absl::string_view>, std::allocator<int>>)); + if (std::is_empty<HashtablezInfoHandle>::value) { + EXPECT_EQ(sizeof(MockTableInfozDisabled), + sizeof(raw_hash_set<StringPolicy, StatelessHash, + std::equal_to<absl::string_view>, + std::allocator<int>>)); + + EXPECT_EQ(sizeof(MockTableInfozDisabled) + sizeof(StatefulHash), + sizeof(raw_hash_set<StringPolicy, StatefulHash, + std::equal_to<absl::string_view>, + std::allocator<int>>)); + } else { + EXPECT_EQ(sizeof(MockTable), + sizeof(raw_hash_set<StringPolicy, StatelessHash, + std::equal_to<absl::string_view>, + std::allocator<int>>)); - EXPECT_EQ( - sizeof(MockTable) + sizeof(StatefulHash), - sizeof( - raw_hash_set<StringPolicy, StatefulHash, - std::equal_to<absl::string_view>, std::allocator<int>>)); + EXPECT_EQ(sizeof(MockTable) + sizeof(StatefulHash), + sizeof(raw_hash_set<StringPolicy, StatefulHash, + std::equal_to<absl::string_view>, + std::allocator<int>>)); + } } TEST(Table, Empty) { @@ -524,6 +541,37 @@ TEST(Table, 
InsertCollisionAndFindAfterDelete) { EXPECT_TRUE(t.empty()); } +TEST(Table, InsertWithinCapacity) { + IntTable t; + t.reserve(10); + const size_t original_capacity = t.capacity(); + const auto addr = [&](int i) { + return reinterpret_cast<uintptr_t>(&*t.find(i)); + }; + // Inserting an element does not change capacity. + t.insert(0); + EXPECT_THAT(t.capacity(), original_capacity); + const uintptr_t original_addr_0 = addr(0); + // Inserting another element does not rehash. + t.insert(1); + EXPECT_THAT(t.capacity(), original_capacity); + EXPECT_THAT(addr(0), original_addr_0); + // Inserting lots of duplicate elements does not rehash. + for (int i = 0; i < 100; ++i) { + t.insert(i % 10); + } + EXPECT_THAT(t.capacity(), original_capacity); + EXPECT_THAT(addr(0), original_addr_0); + // Inserting a range of duplicate elements does not rehash. + std::vector<int> dup_range; + for (int i = 0; i < 100; ++i) { + dup_range.push_back(i % 10); + } + t.insert(dup_range.begin(), dup_range.end()); + EXPECT_THAT(t.capacity(), original_capacity); + EXPECT_THAT(addr(0), original_addr_0); +} + TEST(Table, LazyEmplace) { StringTable t; bool called = false; diff --git a/third_party/abseil-cpp/absl/container/internal/unordered_map_constructor_test.h b/third_party/abseil-cpp/absl/container/internal/unordered_map_constructor_test.h index 3f90ad7ca8..c1d20f3c52 100644 --- a/third_party/abseil-cpp/absl/container/internal/unordered_map_constructor_test.h +++ b/third_party/abseil-cpp/absl/container/internal/unordered_map_constructor_test.h @@ -179,7 +179,7 @@ TYPED_TEST_P(ConstructorTest, InputIteratorBucketHashEqualAlloc) { A alloc(0); std::vector<T> values; std::generate_n(std::back_inserter(values), 10, - hash_internal::Generator<T>()); + hash_internal::UniqueGenerator<T>()); TypeParam m(values.begin(), values.end(), 123, hasher, equal, alloc); EXPECT_EQ(m.hash_function(), hasher); EXPECT_EQ(m.key_eq(), equal); @@ -198,7 +198,7 @@ void InputIteratorBucketAllocTest(std::true_type) { A 
alloc(0); std::vector<T> values; std::generate_n(std::back_inserter(values), 10, - hash_internal::Generator<T>()); + hash_internal::UniqueGenerator<T>()); TypeParam m(values.begin(), values.end(), 123, alloc); EXPECT_EQ(m.get_allocator(), alloc); EXPECT_THAT(items(m), ::testing::UnorderedElementsAreArray(values)); @@ -221,7 +221,7 @@ void InputIteratorBucketHashAllocTest(std::true_type) { A alloc(0); std::vector<T> values; std::generate_n(std::back_inserter(values), 10, - hash_internal::Generator<T>()); + hash_internal::UniqueGenerator<T>()); TypeParam m(values.begin(), values.end(), 123, hasher, alloc); EXPECT_EQ(m.hash_function(), hasher); EXPECT_EQ(m.get_allocator(), alloc); @@ -241,8 +241,9 @@ TYPED_TEST_P(ConstructorTest, CopyConstructor) { H hasher; E equal; A alloc(0); + hash_internal::UniqueGenerator<T> gen; TypeParam m(123, hasher, equal, alloc); - for (size_t i = 0; i != 10; ++i) m.insert(hash_internal::Generator<T>()()); + for (size_t i = 0; i != 10; ++i) m.insert(gen()); TypeParam n(m); EXPECT_EQ(m.hash_function(), n.hash_function()); EXPECT_EQ(m.key_eq(), n.key_eq()); @@ -262,8 +263,9 @@ void CopyConstructorAllocTest(std::true_type) { H hasher; E equal; A alloc(0); + hash_internal::UniqueGenerator<T> gen; TypeParam m(123, hasher, equal, alloc); - for (size_t i = 0; i != 10; ++i) m.insert(hash_internal::Generator<T>()()); + for (size_t i = 0; i != 10; ++i) m.insert(gen()); TypeParam n(m, A(11)); EXPECT_EQ(m.hash_function(), n.hash_function()); EXPECT_EQ(m.key_eq(), n.key_eq()); @@ -285,8 +287,9 @@ TYPED_TEST_P(ConstructorTest, MoveConstructor) { H hasher; E equal; A alloc(0); + hash_internal::UniqueGenerator<T> gen; TypeParam m(123, hasher, equal, alloc); - for (size_t i = 0; i != 10; ++i) m.insert(hash_internal::Generator<T>()()); + for (size_t i = 0; i != 10; ++i) m.insert(gen()); TypeParam t(m); TypeParam n(std::move(t)); EXPECT_EQ(m.hash_function(), n.hash_function()); @@ -307,8 +310,9 @@ void MoveConstructorAllocTest(std::true_type) { H hasher; E 
equal; A alloc(0); + hash_internal::UniqueGenerator<T> gen; TypeParam m(123, hasher, equal, alloc); - for (size_t i = 0; i != 10; ++i) m.insert(hash_internal::Generator<T>()()); + for (size_t i = 0; i != 10; ++i) m.insert(gen()); TypeParam t(m); TypeParam n(std::move(t), A(1)); EXPECT_EQ(m.hash_function(), n.hash_function()); @@ -325,7 +329,7 @@ TYPED_TEST_P(ConstructorTest, MoveConstructorAlloc) { TYPED_TEST_P(ConstructorTest, InitializerListBucketHashEqualAlloc) { using T = hash_internal::GeneratedType<TypeParam>; - hash_internal::Generator<T> gen; + hash_internal::UniqueGenerator<T> gen; std::initializer_list<T> values = {gen(), gen(), gen(), gen(), gen()}; using H = typename TypeParam::hasher; using E = typename TypeParam::key_equal; @@ -348,7 +352,7 @@ template <typename TypeParam> void InitializerListBucketAllocTest(std::true_type) { using T = hash_internal::GeneratedType<TypeParam>; using A = typename TypeParam::allocator_type; - hash_internal::Generator<T> gen; + hash_internal::UniqueGenerator<T> gen; std::initializer_list<T> values = {gen(), gen(), gen(), gen(), gen()}; A alloc(0); TypeParam m(values, 123, alloc); @@ -371,7 +375,7 @@ void InitializerListBucketHashAllocTest(std::true_type) { using A = typename TypeParam::allocator_type; H hasher; A alloc(0); - hash_internal::Generator<T> gen; + hash_internal::UniqueGenerator<T> gen; std::initializer_list<T> values = {gen(), gen(), gen(), gen(), gen()}; TypeParam m(values, 123, hasher, alloc); EXPECT_EQ(m.hash_function(), hasher); @@ -392,7 +396,7 @@ TYPED_TEST_P(ConstructorTest, Assignment) { H hasher; E equal; A alloc(0); - hash_internal::Generator<T> gen; + hash_internal::UniqueGenerator<T> gen; TypeParam m({gen(), gen(), gen()}, 123, hasher, equal, alloc); TypeParam n; n = m; @@ -412,7 +416,7 @@ TYPED_TEST_P(ConstructorTest, MoveAssignment) { H hasher; E equal; A alloc(0); - hash_internal::Generator<T> gen; + hash_internal::UniqueGenerator<T> gen; TypeParam m({gen(), gen(), gen()}, 123, hasher, equal, 
alloc); TypeParam t(m); TypeParam n; @@ -424,7 +428,7 @@ TYPED_TEST_P(ConstructorTest, MoveAssignment) { TYPED_TEST_P(ConstructorTest, AssignmentFromInitializerList) { using T = hash_internal::GeneratedType<TypeParam>; - hash_internal::Generator<T> gen; + hash_internal::UniqueGenerator<T> gen; std::initializer_list<T> values = {gen(), gen(), gen(), gen(), gen()}; TypeParam m; m = values; @@ -433,7 +437,7 @@ TYPED_TEST_P(ConstructorTest, AssignmentFromInitializerList) { TYPED_TEST_P(ConstructorTest, AssignmentOverwritesExisting) { using T = hash_internal::GeneratedType<TypeParam>; - hash_internal::Generator<T> gen; + hash_internal::UniqueGenerator<T> gen; TypeParam m({gen(), gen(), gen()}); TypeParam n({gen()}); n = m; @@ -442,7 +446,7 @@ TYPED_TEST_P(ConstructorTest, AssignmentOverwritesExisting) { TYPED_TEST_P(ConstructorTest, MoveAssignmentOverwritesExisting) { using T = hash_internal::GeneratedType<TypeParam>; - hash_internal::Generator<T> gen; + hash_internal::UniqueGenerator<T> gen; TypeParam m({gen(), gen(), gen()}); TypeParam t(m); TypeParam n({gen()}); @@ -452,7 +456,7 @@ TYPED_TEST_P(ConstructorTest, MoveAssignmentOverwritesExisting) { TYPED_TEST_P(ConstructorTest, AssignmentFromInitializerListOverwritesExisting) { using T = hash_internal::GeneratedType<TypeParam>; - hash_internal::Generator<T> gen; + hash_internal::UniqueGenerator<T> gen; std::initializer_list<T> values = {gen(), gen(), gen(), gen(), gen()}; TypeParam m; m = values; @@ -461,7 +465,7 @@ TYPED_TEST_P(ConstructorTest, AssignmentFromInitializerListOverwritesExisting) { TYPED_TEST_P(ConstructorTest, AssignmentOnSelf) { using T = hash_internal::GeneratedType<TypeParam>; - hash_internal::Generator<T> gen; + hash_internal::UniqueGenerator<T> gen; std::initializer_list<T> values = {gen(), gen(), gen(), gen(), gen()}; TypeParam m(values); m = *&m; // Avoid -Wself-assign diff --git a/third_party/abseil-cpp/absl/container/internal/unordered_map_modifiers_test.h 
b/third_party/abseil-cpp/absl/container/internal/unordered_map_modifiers_test.h index 8c9ca779a4..d3543936f7 100644 --- a/third_party/abseil-cpp/absl/container/internal/unordered_map_modifiers_test.h +++ b/third_party/abseil-cpp/absl/container/internal/unordered_map_modifiers_test.h @@ -81,6 +81,38 @@ TYPED_TEST_P(ModifiersTest, InsertRange) { ASSERT_THAT(items(m), ::testing::UnorderedElementsAreArray(values)); } +TYPED_TEST_P(ModifiersTest, InsertWithinCapacity) { + using T = hash_internal::GeneratedType<TypeParam>; + using V = typename TypeParam::mapped_type; + T val = hash_internal::Generator<T>()(); + TypeParam m; + m.reserve(10); + const size_t original_capacity = m.bucket_count(); + m.insert(val); + EXPECT_EQ(m.bucket_count(), original_capacity); + T val2 = {val.first, hash_internal::Generator<V>()()}; + m.insert(val2); + EXPECT_EQ(m.bucket_count(), original_capacity); +} + +TYPED_TEST_P(ModifiersTest, InsertRangeWithinCapacity) { +#if !defined(__GLIBCXX__) + using T = hash_internal::GeneratedType<TypeParam>; + std::vector<T> base_values; + std::generate_n(std::back_inserter(base_values), 10, + hash_internal::Generator<T>()); + std::vector<T> values; + while (values.size() != 100) { + std::copy_n(base_values.begin(), 10, std::back_inserter(values)); + } + TypeParam m; + m.reserve(10); + const size_t original_capacity = m.bucket_count(); + m.insert(values.begin(), values.end()); + EXPECT_EQ(m.bucket_count(), original_capacity); +#endif +} + TYPED_TEST_P(ModifiersTest, InsertOrAssign) { #ifdef UNORDERED_MAP_CXX17 using std::get; @@ -266,9 +298,10 @@ TYPED_TEST_P(ModifiersTest, Swap) { // TODO(alkis): Write tests for merge. 
REGISTER_TYPED_TEST_CASE_P(ModifiersTest, Clear, Insert, InsertHint, - InsertRange, InsertOrAssign, InsertOrAssignHint, - Emplace, EmplaceHint, TryEmplace, TryEmplaceHint, - Erase, EraseRange, EraseKey, Swap); + InsertRange, InsertWithinCapacity, + InsertRangeWithinCapacity, InsertOrAssign, + InsertOrAssignHint, Emplace, EmplaceHint, TryEmplace, + TryEmplaceHint, Erase, EraseRange, EraseKey, Swap); template <typename Type> struct is_unique_ptr : std::false_type {}; diff --git a/third_party/abseil-cpp/absl/container/internal/unordered_set_modifiers_test.h b/third_party/abseil-cpp/absl/container/internal/unordered_set_modifiers_test.h index 26be58d99f..6e473e45da 100644 --- a/third_party/abseil-cpp/absl/container/internal/unordered_set_modifiers_test.h +++ b/third_party/abseil-cpp/absl/container/internal/unordered_set_modifiers_test.h @@ -74,6 +74,36 @@ TYPED_TEST_P(ModifiersTest, InsertRange) { ASSERT_THAT(keys(m), ::testing::UnorderedElementsAreArray(values)); } +TYPED_TEST_P(ModifiersTest, InsertWithinCapacity) { + using T = hash_internal::GeneratedType<TypeParam>; + T val = hash_internal::Generator<T>()(); + TypeParam m; + m.reserve(10); + const size_t original_capacity = m.bucket_count(); + m.insert(val); + EXPECT_EQ(m.bucket_count(), original_capacity); + m.insert(val); + EXPECT_EQ(m.bucket_count(), original_capacity); +} + +TYPED_TEST_P(ModifiersTest, InsertRangeWithinCapacity) { +#if !defined(__GLIBCXX__) + using T = hash_internal::GeneratedType<TypeParam>; + std::vector<T> base_values; + std::generate_n(std::back_inserter(base_values), 10, + hash_internal::Generator<T>()); + std::vector<T> values; + while (values.size() != 100) { + values.insert(values.end(), base_values.begin(), base_values.end()); + } + TypeParam m; + m.reserve(10); + const size_t original_capacity = m.bucket_count(); + m.insert(values.begin(), values.end()); + EXPECT_EQ(m.bucket_count(), original_capacity); +#endif +} + TYPED_TEST_P(ModifiersTest, Emplace) { using T = 
hash_internal::GeneratedType<TypeParam>; T val = hash_internal::Generator<T>()(); @@ -180,8 +210,9 @@ TYPED_TEST_P(ModifiersTest, Swap) { // TODO(alkis): Write tests for merge. REGISTER_TYPED_TEST_CASE_P(ModifiersTest, Clear, Insert, InsertHint, - InsertRange, Emplace, EmplaceHint, Erase, EraseRange, - EraseKey, Swap); + InsertRange, InsertWithinCapacity, + InsertRangeWithinCapacity, Emplace, EmplaceHint, + Erase, EraseRange, EraseKey, Swap); } // namespace container_internal ABSL_NAMESPACE_END diff --git a/third_party/abseil-cpp/absl/copts/AbseilConfigureCopts.cmake b/third_party/abseil-cpp/absl/copts/AbseilConfigureCopts.cmake index 9cd6fd1b2a..942ce90a4d 100644 --- a/third_party/abseil-cpp/absl/copts/AbseilConfigureCopts.cmake +++ b/third_party/abseil-cpp/absl/copts/AbseilConfigureCopts.cmake @@ -35,8 +35,7 @@ endif() if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set(ABSL_DEFAULT_COPTS "${ABSL_GCC_FLAGS}") set(ABSL_TEST_COPTS "${ABSL_GCC_FLAGS};${ABSL_GCC_TEST_FLAGS}") -elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - # MATCHES so we get both Clang and AppleClang +elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") # MATCHES so we get both Clang and AppleClang if(MSVC) # clang-cl is half MSVC, half LLVM set(ABSL_DEFAULT_COPTS "${ABSL_CLANG_CL_FLAGS}") diff --git a/third_party/abseil-cpp/absl/copts/GENERATED_AbseilCopts.cmake b/third_party/abseil-cpp/absl/copts/GENERATED_AbseilCopts.cmake index 51742c9b6b..22a25eba7f 100644 --- a/third_party/abseil-cpp/absl/copts/GENERATED_AbseilCopts.cmake +++ b/third_party/abseil-cpp/absl/copts/GENERATED_AbseilCopts.cmake @@ -71,12 +71,13 @@ list(APPEND ABSL_LLVM_FLAGS "-Wformat-security" "-Wgnu-redeclared-enum" "-Winfinite-recursion" + "-Winvalid-constexpr" "-Wliteral-conversion" "-Wmissing-declarations" "-Woverlength-strings" "-Wpointer-arith" "-Wself-assign" - "-Wshadow" + "-Wshadow-all" "-Wstring-conversion" "-Wtautological-overlap-compare" "-Wundef" diff --git a/third_party/abseil-cpp/absl/debugging/CMakeLists.txt 
b/third_party/abseil-cpp/absl/debugging/CMakeLists.txt index 074b44cf17..bb4d4c92da 100644 --- a/third_party/abseil-cpp/absl/debugging/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/debugging/CMakeLists.txt @@ -87,7 +87,7 @@ absl_cc_test( absl::memory absl::raw_logging_internal absl::strings - gmock + GTest::gmock ) absl_cc_library( @@ -141,7 +141,7 @@ absl_cc_test( absl::strings absl::raw_logging_internal Threads::Threads - gmock + GTest::gmock ) absl_cc_library( @@ -194,7 +194,7 @@ absl_cc_test( absl::core_headers absl::memory absl::raw_logging_internal - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -261,7 +261,7 @@ absl_cc_test( DEPS absl::leak_check_api_enabled_for_testing absl::base - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -275,7 +275,7 @@ absl_cc_test( DEPS absl::leak_check_api_disabled_for_testing absl::base - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -292,7 +292,7 @@ absl_cc_test( absl::leak_check_disable absl::base absl::raw_logging_internal - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -322,7 +322,7 @@ absl_cc_test( absl::stack_consumption absl::core_headers absl::raw_logging_internal - gmock_main + GTest::gmock_main ) # component target diff --git a/third_party/abseil-cpp/absl/debugging/failure_signal_handler.cc b/third_party/abseil-cpp/absl/debugging/failure_signal_handler.cc index a9ed6ef964..689e5979e7 100644 --- a/third_party/abseil-cpp/absl/debugging/failure_signal_handler.cc +++ b/third_party/abseil-cpp/absl/debugging/failure_signal_handler.cc @@ -136,7 +136,8 @@ static bool SetupAlternateStackOnce() { #else const size_t page_mask = sysconf(_SC_PAGESIZE) - 1; #endif - size_t stack_size = (std::max(SIGSTKSZ, 65536) + page_mask) & ~page_mask; + size_t stack_size = + (std::max<size_t>(SIGSTKSZ, 65536) + page_mask) & ~page_mask; #if defined(ABSL_HAVE_ADDRESS_SANITIZER) || \ defined(ABSL_HAVE_MEMORY_SANITIZER) || defined(ABSL_HAVE_THREAD_SANITIZER) // Account for sanitizer instrumentation requiring additional stack 
space. @@ -366,6 +367,7 @@ static void AbslFailureSignalHandler(int signo, siginfo_t*, void* ucontext) { // goes after this point. if (fsh_options.writerfn != nullptr) { WriteFailureInfo(signo, ucontext, my_cpu, fsh_options.writerfn); + fsh_options.writerfn(nullptr); } if (fsh_options.call_previous_handler) { diff --git a/third_party/abseil-cpp/absl/debugging/failure_signal_handler.h b/third_party/abseil-cpp/absl/debugging/failure_signal_handler.h index 0c0f585d0f..500115c0ab 100644 --- a/third_party/abseil-cpp/absl/debugging/failure_signal_handler.h +++ b/third_party/abseil-cpp/absl/debugging/failure_signal_handler.h @@ -90,7 +90,7 @@ struct FailureSignalHandlerOptions { // If non-null, indicates a pointer to a callback function that will be called // upon failure, with a string argument containing failure data. This function // may be used as a hook to write failure data to a secondary location, such - // as a log file. This function may also be called with null data, as a hint + // as a log file. This function will also be called with null data, as a hint // to flush any buffered data before the program may be terminated. Consider // flushing any buffered data in all calls to this function. // diff --git a/third_party/abseil-cpp/absl/debugging/internal/demangle.cc b/third_party/abseil-cpp/absl/debugging/internal/demangle.cc index 46cdb67b1f..5cd563208e 100644 --- a/third_party/abseil-cpp/absl/debugging/internal/demangle.cc +++ b/third_party/abseil-cpp/absl/debugging/internal/demangle.cc @@ -386,24 +386,28 @@ static bool IsDigit(char c) { return c >= '0' && c <= '9'; } // by GCC 4.5.x and later versions (and our locally-modified version of GCC // 4.4.x) to indicate functions which have been cloned during optimization. // We treat any sequence (.<alpha>+.<digit>+)+ as a function clone suffix. +// Additionally, '_' is allowed along with the alphanumeric sequence. 
static bool IsFunctionCloneSuffix(const char *str) { size_t i = 0; while (str[i] != '\0') { - // Consume a single .<alpha>+.<digit>+ sequence. - if (str[i] != '.' || !IsAlpha(str[i + 1])) { - return false; + bool parsed = false; + // Consume a single [.<alpha> | _]*[.<digit>]* sequence. + if (str[i] == '.' && (IsAlpha(str[i + 1]) || str[i + 1] == '_')) { + parsed = true; + i += 2; + while (IsAlpha(str[i]) || str[i] == '_') { + ++i; + } } - i += 2; - while (IsAlpha(str[i])) { - ++i; + if (str[i] == '.' && IsDigit(str[i + 1])) { + parsed = true; + i += 2; + while (IsDigit(str[i])) { + ++i; + } } - if (str[i] != '.' || !IsDigit(str[i + 1])) { + if (!parsed) return false; - } - i += 2; - while (IsDigit(str[i])) { - ++i; - } } return true; // Consumed everything in "str". } diff --git a/third_party/abseil-cpp/absl/debugging/internal/demangle_test.cc b/third_party/abseil-cpp/absl/debugging/internal/demangle_test.cc index 0bed7359d8..6b142902ca 100644 --- a/third_party/abseil-cpp/absl/debugging/internal/demangle_test.cc +++ b/third_party/abseil-cpp/absl/debugging/internal/demangle_test.cc @@ -70,12 +70,34 @@ TEST(Demangle, Clones) { EXPECT_STREQ("Foo()", tmp); EXPECT_TRUE(Demangle("_ZL3Foov.isra.2.constprop.18", tmp, sizeof(tmp))); EXPECT_STREQ("Foo()", tmp); - // Invalid (truncated), should not demangle. - EXPECT_FALSE(Demangle("_ZL3Foov.clo", tmp, sizeof(tmp))); + // Demangle suffixes produced by -funique-internal-linkage-names. + EXPECT_TRUE(Demangle("_ZL3Foov.__uniq.12345", tmp, sizeof(tmp))); + EXPECT_STREQ("Foo()", tmp); + EXPECT_TRUE(Demangle("_ZL3Foov.__uniq.12345.isra.2.constprop.18", tmp, + sizeof(tmp))); + EXPECT_STREQ("Foo()", tmp); + // Suffixes without the number should also demangle. + EXPECT_TRUE(Demangle("_ZL3Foov.clo", tmp, sizeof(tmp))); + EXPECT_STREQ("Foo()", tmp); + // Suffixes with just the number should also demangle. + EXPECT_TRUE(Demangle("_ZL3Foov.123", tmp, sizeof(tmp))); + EXPECT_STREQ("Foo()", tmp); + // (.clone. 
followed by non-number), should also demangle. + EXPECT_TRUE(Demangle("_ZL3Foov.clone.foo", tmp, sizeof(tmp))); + EXPECT_STREQ("Foo()", tmp); + // (.clone. followed by multiple numbers), should also demangle. + EXPECT_TRUE(Demangle("_ZL3Foov.clone.123.456", tmp, sizeof(tmp))); + EXPECT_STREQ("Foo()", tmp); + // (a long valid suffix), should demangle. + EXPECT_TRUE(Demangle("_ZL3Foov.part.9.165493.constprop.775.31805", tmp, + sizeof(tmp))); + EXPECT_STREQ("Foo()", tmp); + // Invalid (. without anything else), should not demangle. + EXPECT_FALSE(Demangle("_ZL3Foov.", tmp, sizeof(tmp))); + // Invalid (. with mix of alpha and digits), should not demangle. + EXPECT_FALSE(Demangle("_ZL3Foov.abc123", tmp, sizeof(tmp))); // Invalid (.clone. not followed by number), should not demangle. EXPECT_FALSE(Demangle("_ZL3Foov.clone.", tmp, sizeof(tmp))); - // Invalid (.clone. followed by non-number), should not demangle. - EXPECT_FALSE(Demangle("_ZL3Foov.clone.foo", tmp, sizeof(tmp))); // Invalid (.constprop. not followed by number), should not demangle. EXPECT_FALSE(Demangle("_ZL3Foov.isra.2.constprop.", tmp, sizeof(tmp))); } diff --git a/third_party/abseil-cpp/absl/debugging/internal/stacktrace_x86-inl.inc b/third_party/abseil-cpp/absl/debugging/internal/stacktrace_x86-inl.inc index bc320ff75b..70f79dfcb8 100644 --- a/third_party/abseil-cpp/absl/debugging/internal/stacktrace_x86-inl.inc +++ b/third_party/abseil-cpp/absl/debugging/internal/stacktrace_x86-inl.inc @@ -132,9 +132,8 @@ static uintptr_t GetFP(const void *vuc) { const uintptr_t bp = 0; const uintptr_t sp = 0; #endif - // Sanity-check that the base pointer is valid. It should be as long as - // SHRINK_WRAP_FRAME_POINTER is not set, but it's possible that some code in - // the process is compiled with --copt=-fomit-frame-pointer or + // Sanity-check that the base pointer is valid. It's possible that some + // code in the process is compiled with --copt=-fomit-frame-pointer or // --copt=-momit-leaf-frame-pointer. 
// // TODO(bcmills): -momit-leaf-frame-pointer is currently the default @@ -247,7 +246,7 @@ static void **NextStackFrame(void **old_fp, const void *uc) { // using an alternate signal stack. // // TODO(bcmills): The GetFP call should be completely unnecessary when - // SHRINK_WRAP_FRAME_POINTER is set (because we should be back in the thread's + // ENABLE_COMBINED_UNWINDER is set (because we should be back in the thread's // stack by this point), but it is empirically still needed (e.g. when the // stack includes a call to abort). unw_get_reg returns UNW_EBADREG for some // frames. Figure out why GetValidFrameAddr and/or libunwind isn't doing what diff --git a/third_party/abseil-cpp/absl/debugging/leak_check.cc b/third_party/abseil-cpp/absl/debugging/leak_check.cc index ff9049559d..764ca0ad00 100644 --- a/third_party/abseil-cpp/absl/debugging/leak_check.cc +++ b/third_party/abseil-cpp/absl/debugging/leak_check.cc @@ -16,6 +16,7 @@ // When lsan is not linked in, these functions are not available, // therefore Abseil code which depends on these functions is conditioned on the // definition of LEAK_SANITIZER. 
+#include "absl/base/attributes.h" #include "absl/debugging/leak_check.h" #ifndef LEAK_SANITIZER @@ -23,6 +24,7 @@ namespace absl { ABSL_NAMESPACE_BEGIN bool HaveLeakSanitizer() { return false; } +bool LeakCheckerIsActive() { return false; } void DoIgnoreLeak(const void*) { } void RegisterLivePointers(const void*, size_t) { } void UnRegisterLivePointers(const void*, size_t) { } @@ -35,9 +37,23 @@ ABSL_NAMESPACE_END #include <sanitizer/lsan_interface.h> +#if ABSL_HAVE_ATTRIBUTE_WEAK +extern "C" ABSL_ATTRIBUTE_WEAK int __lsan_is_turned_off(); +#endif + namespace absl { ABSL_NAMESPACE_BEGIN bool HaveLeakSanitizer() { return true; } + +#if ABSL_HAVE_ATTRIBUTE_WEAK +bool LeakCheckerIsActive() { + return !(&__lsan_is_turned_off && __lsan_is_turned_off()); +} +#else +bool LeakCheckerIsActive() { return true; } +#endif + +bool FindAndReportLeaks() { return __lsan_do_recoverable_leak_check(); } void DoIgnoreLeak(const void* ptr) { __lsan_ignore_object(ptr); } void RegisterLivePointers(const void* ptr, size_t size) { __lsan_register_root_region(ptr, size); diff --git a/third_party/abseil-cpp/absl/debugging/leak_check.h b/third_party/abseil-cpp/absl/debugging/leak_check.h index b66a81c3bc..5fc2b052e4 100644 --- a/third_party/abseil-cpp/absl/debugging/leak_check.h +++ b/third_party/abseil-cpp/absl/debugging/leak_check.h @@ -43,6 +43,12 @@ ABSL_NAMESPACE_BEGIN // currently built into this target. bool HaveLeakSanitizer(); +// LeakCheckerIsActive() +// +// Returns true if a leak-checking sanitizer (either ASan or standalone LSan) is +// currently built into this target and is turned on. +bool LeakCheckerIsActive(); + // DoIgnoreLeak() // // Implements `IgnoreLeak()` below. This function should usually @@ -71,6 +77,19 @@ T* IgnoreLeak(T* ptr) { return ptr; } +// FindAndReportLeaks() +// +// If any leaks are detected, prints a leak report and returns true. This +// function may be called repeatedly, and does not affect end-of-process leak +// checking. 
+// +// Example: +// if (FindAndReportLeaks()) { +// ... diagnostic already printed. Exit with failure code. +// exit(1) +// } +bool FindAndReportLeaks(); + // LeakCheckDisabler // // This helper class indicates that any heap allocations done in the code block diff --git a/third_party/abseil-cpp/absl/debugging/leak_check_test.cc b/third_party/abseil-cpp/absl/debugging/leak_check_test.cc index b5cc487488..9fcfc8e50b 100644 --- a/third_party/abseil-cpp/absl/debugging/leak_check_test.cc +++ b/third_party/abseil-cpp/absl/debugging/leak_check_test.cc @@ -23,8 +23,10 @@ namespace { TEST(LeakCheckTest, DetectLeakSanitizer) { #ifdef ABSL_EXPECT_LEAK_SANITIZER EXPECT_TRUE(absl::HaveLeakSanitizer()); + EXPECT_TRUE(absl::LeakCheckerIsActive()); #else EXPECT_FALSE(absl::HaveLeakSanitizer()); + EXPECT_FALSE(absl::LeakCheckerIsActive()); #endif } diff --git a/third_party/abseil-cpp/absl/debugging/symbolize_elf.inc b/third_party/abseil-cpp/absl/debugging/symbolize_elf.inc index f4d5727bde..87dbd078b9 100644 --- a/third_party/abseil-cpp/absl/debugging/symbolize_elf.inc +++ b/third_party/abseil-cpp/absl/debugging/symbolize_elf.inc @@ -701,6 +701,16 @@ static ABSL_ATTRIBUTE_NOINLINE FindSymbolResult FindSymbol( const char *start_address = ComputeOffset(original_start_address, relocation); +#ifdef __arm__ + // ARM functions are always aligned to multiples of two bytes; the + // lowest-order bit in start_address is ignored by the CPU and indicates + // whether the function contains ARM (0) or Thumb (1) code. We don't care + // about what encoding is being used; we just want the real start address + // of the function. + start_address = reinterpret_cast<const char *>( + reinterpret_cast<uintptr_t>(start_address) & ~1); +#endif + if (deref_function_descriptor_pointer && InSection(original_start_address, opd)) { // The opd section is mapped into memory. 
Just dereference diff --git a/third_party/abseil-cpp/absl/debugging/symbolize_test.cc b/third_party/abseil-cpp/absl/debugging/symbolize_test.cc index a2dd4956c4..35de02e24b 100644 --- a/third_party/abseil-cpp/absl/debugging/symbolize_test.cc +++ b/third_party/abseil-cpp/absl/debugging/symbolize_test.cc @@ -477,6 +477,46 @@ void ABSL_ATTRIBUTE_NOINLINE TestWithReturnAddress() { #endif } +#if defined(__arm__) && ABSL_HAVE_ATTRIBUTE(target) +// Test that we correctly identify bounds of Thumb functions on ARM. +// +// Thumb functions have the lowest-order bit set in their addresses in the ELF +// symbol table. This requires some extra logic to properly compute function +// bounds. To test this logic, nudge a Thumb function right up against an ARM +// function and try to symbolize the ARM function. +// +// A naive implementation will simply use the Thumb function's entry point as +// written in the symbol table and will therefore treat the Thumb function as +// extending one byte further in the instruction stream than it actually does. +// When asked to symbolize the start of the ARM function, it will identify an +// overlap between the Thumb and ARM functions, and it will return the name of +// the Thumb function. +// +// A correct implementation, on the other hand, will null out the lowest-order +// bit in the Thumb function's entry point. It will correctly compute the end of +// the Thumb function, it will find no overlap between the Thumb and ARM +// functions, and it will return the name of the ARM function. 
+ +__attribute__((target("thumb"))) int ArmThumbOverlapThumb(int x) { + return x * x * x; +} + +__attribute__((target("arm"))) int ArmThumbOverlapArm(int x) { + return x * x * x; +} + +void ABSL_ATTRIBUTE_NOINLINE TestArmThumbOverlap() { +#if defined(ABSL_HAVE_ATTRIBUTE_NOINLINE) + const char *symbol = TrySymbolize((void *)&ArmThumbOverlapArm); + ABSL_RAW_CHECK(symbol != nullptr, "TestArmThumbOverlap failed"); + ABSL_RAW_CHECK(strcmp("ArmThumbOverlapArm()", symbol) == 0, + "TestArmThumbOverlap failed"); + std::cout << "TestArmThumbOverlap passed" << std::endl; +#endif +} + +#endif // defined(__arm__) && ABSL_HAVE_ATTRIBUTE(target) + #elif defined(_WIN32) #if !defined(ABSL_CONSUME_DLL) @@ -551,6 +591,9 @@ int main(int argc, char **argv) { TestWithPCInsideInlineFunction(); TestWithPCInsideNonInlineFunction(); TestWithReturnAddress(); +#if defined(__arm__) && ABSL_HAVE_ATTRIBUTE(target) + TestArmThumbOverlap(); +#endif #endif return RUN_ALL_TESTS(); diff --git a/third_party/abseil-cpp/absl/flags/CMakeLists.txt b/third_party/abseil-cpp/absl/flags/CMakeLists.txt index caac69cf89..956f70f868 100644 --- a/third_party/abseil-cpp/absl/flags/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/flags/CMakeLists.txt @@ -239,6 +239,7 @@ absl_cc_library( absl::flags_private_handle_accessor absl::flags_program_name absl::flags_reflection + absl::flat_hash_map absl::strings absl::synchronization ) @@ -309,7 +310,7 @@ absl_cc_test( absl::flags_reflection absl::memory absl::strings - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -321,7 +322,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::flags_config - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -341,7 +342,7 @@ absl_cc_test( absl::flags_reflection absl::strings absl::time - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -353,7 +354,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::flags_marshalling - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -372,7 +373,7 @@ absl_cc_test( absl::scoped_set_env absl::span 
absl::strings - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -384,7 +385,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::flags_path_util - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -397,7 +398,7 @@ absl_cc_test( DEPS absl::flags_program_name absl::strings - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -414,7 +415,7 @@ absl_cc_test( absl::flags_usage absl::memory absl::strings - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -428,7 +429,7 @@ absl_cc_test( absl::base absl::flags_internal absl::time - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -443,7 +444,7 @@ absl_cc_test( absl::flags_path_util absl::flags_program_name absl::strings - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -462,5 +463,5 @@ absl_cc_test( absl::flags_reflection absl::flags_usage absl::strings - gtest + GTest::gtest ) diff --git a/third_party/abseil-cpp/absl/flags/flag.h b/third_party/abseil-cpp/absl/flags/flag.h index f09580b06a..14209e7ba7 100644 --- a/third_party/abseil-cpp/absl/flags/flag.h +++ b/third_party/abseil-cpp/absl/flags/flag.h @@ -265,6 +265,8 @@ ABSL_NAMESPACE_END // // ABSL_FLAG(T, name, default_value, help).OnUpdate(callback); // +// `callback` should be convertible to `void (*)()`. +// // After any setting of the flag value, the callback will be called at least // once. A rapid sequence of changes may be merged together into the same // callback. No concurrent calls to the callback will be made for the same @@ -279,7 +281,6 @@ ABSL_NAMESPACE_END // Note: ABSL_FLAG.OnUpdate() does not have a public definition. Hence, this // comment serves as its API documentation. 
- // ----------------------------------------------------------------------------- // Implementation details below this section // ----------------------------------------------------------------------------- diff --git a/third_party/abseil-cpp/absl/flags/internal/usage.cc b/third_party/abseil-cpp/absl/flags/internal/usage.cc index a588c7f73a..949709e883 100644 --- a/third_party/abseil-cpp/absl/flags/internal/usage.cc +++ b/third_party/abseil-cpp/absl/flags/internal/usage.cc @@ -245,7 +245,7 @@ void FlagsHelpImpl(std::ostream& out, PerFlagFilter filter_cb, << XMLElement("usage", program_usage_message) << '\n'; } - // Map of package name to + // Ordered map of package name to // map of file name to // vector of flags in the file. // This map is used to output matching flags grouped by package and file @@ -273,20 +273,26 @@ void FlagsHelpImpl(std::ostream& out, PerFlagFilter filter_cb, absl::string_view package_separator; // controls blank lines between packages absl::string_view file_separator; // controls blank lines between files - for (const auto& package : matching_flags) { + for (auto& package : matching_flags) { if (format == HelpFormat::kHumanReadable) { out << package_separator; package_separator = "\n\n"; } file_separator = ""; - for (const auto& flags_in_file : package.second) { + for (auto& flags_in_file : package.second) { if (format == HelpFormat::kHumanReadable) { out << file_separator << " Flags from " << flags_in_file.first << ":\n"; file_separator = "\n"; } + std::sort(std::begin(flags_in_file.second), + std::end(flags_in_file.second), + [](const CommandLineFlag* lhs, const CommandLineFlag* rhs) { + return lhs->Name() < rhs->Name(); + }); + for (const auto* flag : flags_in_file.second) { flags_internal::FlagHelp(out, *flag, format); } diff --git a/third_party/abseil-cpp/absl/flags/reflection.cc b/third_party/abseil-cpp/absl/flags/reflection.cc index 0c76110163..dbce4032ab 100644 --- a/third_party/abseil-cpp/absl/flags/reflection.cc +++ 
b/third_party/abseil-cpp/absl/flags/reflection.cc @@ -18,11 +18,11 @@ #include <assert.h> #include <atomic> -#include <map> #include <string> #include "absl/base/config.h" #include "absl/base/thread_annotations.h" +#include "absl/container/flat_hash_map.h" #include "absl/flags/commandlineflag.h" #include "absl/flags/internal/private_handle_accessor.h" #include "absl/flags/internal/registry.h" @@ -68,7 +68,7 @@ class FlagRegistry { friend void FinalizeRegistry(); // The map from name to flag, for FindFlag(). - using FlagMap = std::map<absl::string_view, CommandLineFlag*>; + using FlagMap = absl::flat_hash_map<absl::string_view, CommandLineFlag*>; using FlagIterator = FlagMap::iterator; using FlagConstIterator = FlagMap::const_iterator; FlagMap flags_; @@ -204,6 +204,10 @@ void FinalizeRegistry() { for (const auto& f : registry.flags_) { registry.flat_flags_.push_back(f.second); } + std::sort(std::begin(registry.flat_flags_), std::end(registry.flat_flags_), + [](const CommandLineFlag* lhs, const CommandLineFlag* rhs) { + return lhs->Name() < rhs->Name(); + }); registry.flags_.clear(); registry.finalized_flags_.store(true, std::memory_order_release); } diff --git a/third_party/abseil-cpp/absl/functional/CMakeLists.txt b/third_party/abseil-cpp/absl/functional/CMakeLists.txt index cda914f2cd..3919e9a1de 100644 --- a/third_party/abseil-cpp/absl/functional/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/functional/CMakeLists.txt @@ -39,7 +39,7 @@ absl_cc_test( DEPS absl::bind_front absl::memory - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -68,5 +68,5 @@ absl_cc_test( absl::function_ref absl::memory absl::test_instance_tracker - gmock_main + GTest::gmock_main ) diff --git a/third_party/abseil-cpp/absl/functional/function_ref.h b/third_party/abseil-cpp/absl/functional/function_ref.h index 6e03ac2e04..5790a65251 100644 --- a/third_party/abseil-cpp/absl/functional/function_ref.h +++ b/third_party/abseil-cpp/absl/functional/function_ref.h @@ -122,6 +122,7 @@ class 
FunctionRef<R(Args...)> { // To help prevent subtle lifetime bugs, FunctionRef is not assignable. // Typically, it should only be used as an argument type. FunctionRef& operator=(const FunctionRef& rhs) = delete; + FunctionRef(const FunctionRef& rhs) = default; // Call the underlying object. R operator()(Args... args) const { diff --git a/third_party/abseil-cpp/absl/hash/CMakeLists.txt b/third_party/abseil-cpp/absl/hash/CMakeLists.txt index b43bfa542f..c82f66f02c 100644 --- a/third_party/abseil-cpp/absl/hash/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/hash/CMakeLists.txt @@ -52,7 +52,7 @@ absl_cc_library( absl::meta absl::strings absl::variant - gmock + GTest::gmock TESTONLY ) @@ -72,7 +72,7 @@ absl_cc_test( absl::spy_hash_state absl::meta absl::int128 - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -113,7 +113,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::city - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -141,5 +141,5 @@ absl_cc_test( DEPS absl::wyhash absl::strings - gmock_main + GTest::gmock_main ) diff --git a/third_party/abseil-cpp/absl/hash/hash.h b/third_party/abseil-cpp/absl/hash/hash.h index 5de132cac8..8282ea53c6 100644 --- a/third_party/abseil-cpp/absl/hash/hash.h +++ b/third_party/abseil-cpp/absl/hash/hash.h @@ -73,6 +73,8 @@ #ifndef ABSL_HASH_HASH_H_ #define ABSL_HASH_HASH_H_ +#include <tuple> + #include "absl/hash/internal/hash.h" namespace absl { @@ -214,6 +216,26 @@ ABSL_NAMESPACE_BEGIN template <typename T> using Hash = absl::hash_internal::Hash<T>; +// HashOf +// +// absl::HashOf() is a helper that generates a hash from the values of its +// arguments. It dispatches to absl::Hash directly, as follows: +// * HashOf(t) == absl::Hash<T>{}(t) +// * HashOf(a, b, c) == HashOf(std::make_tuple(a, b, c)) +// +// HashOf(a1, a2, ...) == HashOf(b1, b2, ...) is guaranteed when +// * The argument lists have pairwise identical C++ types +// * a1 == b1 && a2 == b2 && ... 
+// +// The requirement that the arguments match in both type and value is critical. +// It means that `a == b` does not necessarily imply `HashOf(a) == HashOf(b)` if +// `a` and `b` have different types. For example, `HashOf(2) != HashOf(2.0)`. +template <int&... ExplicitArgumentBarrier, typename... Types> +size_t HashOf(const Types&... values) { + auto tuple = std::tie(values...); + return absl::Hash<decltype(tuple)>{}(tuple); +} + // HashState // // A type erased version of the hash state concept, for use in user-defined diff --git a/third_party/abseil-cpp/absl/hash/hash_test.cc b/third_party/abseil-cpp/absl/hash/hash_test.cc index 1d2e6cf0df..b3ddebdd42 100644 --- a/third_party/abseil-cpp/absl/hash/hash_test.cc +++ b/third_party/abseil-cpp/absl/hash/hash_test.cc @@ -973,4 +973,39 @@ TEST(HashTest, DoesNotUseImplicitConversionsToBool) { absl::Hash<ValueWithBoolConversion>()(ValueWithBoolConversion{1})); } +TEST(HashOf, MatchesHashForSingleArgument) { + std::string s = "forty two"; + int i = 42; + double d = 42.0; + std::tuple<int, int> t{4, 2}; + + EXPECT_EQ(absl::HashOf(s), absl::Hash<std::string>{}(s)); + EXPECT_EQ(absl::HashOf(i), absl::Hash<int>{}(i)); + EXPECT_EQ(absl::HashOf(d), absl::Hash<double>{}(d)); + EXPECT_EQ(absl::HashOf(t), (absl::Hash<std::tuple<int, int>>{}(t))); +} + +TEST(HashOf, MatchesHashOfTupleForMultipleArguments) { + std::string hello = "hello"; + std::string world = "world"; + + EXPECT_EQ(absl::HashOf(), absl::HashOf(std::make_tuple())); + EXPECT_EQ(absl::HashOf(hello), absl::HashOf(std::make_tuple(hello))); + EXPECT_EQ(absl::HashOf(hello, world), + absl::HashOf(std::make_tuple(hello, world))); +} + +template <typename T> +std::true_type HashOfExplicitParameter(decltype(absl::HashOf<T>(0))) { + return {}; +} +template <typename T> +std::false_type HashOfExplicitParameter(size_t) { + return {}; +} + +TEST(HashOf, CantPassExplicitTemplateParameters) { + EXPECT_FALSE(HashOfExplicitParameter<int>(0)); +} + } // namespace diff --git 
a/third_party/abseil-cpp/absl/hash/internal/hash.cc b/third_party/abseil-cpp/absl/hash/internal/hash.cc index 1433eb9db3..06f53a59c5 100644 --- a/third_party/abseil-cpp/absl/hash/internal/hash.cc +++ b/third_party/abseil-cpp/absl/hash/internal/hash.cc @@ -18,9 +18,8 @@ namespace absl { ABSL_NAMESPACE_BEGIN namespace hash_internal { -uint64_t HashState::CombineLargeContiguousImpl32(uint64_t state, - const unsigned char* first, - size_t len) { +uint64_t MixingHashState::CombineLargeContiguousImpl32( + uint64_t state, const unsigned char* first, size_t len) { while (len >= PiecewiseChunkSize()) { state = Mix(state, absl::hash_internal::CityHash32(reinterpret_cast<const char*>(first), @@ -33,9 +32,8 @@ uint64_t HashState::CombineLargeContiguousImpl32(uint64_t state, std::integral_constant<int, 4>{}); } -uint64_t HashState::CombineLargeContiguousImpl64(uint64_t state, - const unsigned char* first, - size_t len) { +uint64_t MixingHashState::CombineLargeContiguousImpl64( + uint64_t state, const unsigned char* first, size_t len) { while (len >= PiecewiseChunkSize()) { state = Mix(state, Hash64(first, PiecewiseChunkSize())); len -= PiecewiseChunkSize(); @@ -46,7 +44,7 @@ uint64_t HashState::CombineLargeContiguousImpl64(uint64_t state, std::integral_constant<int, 8>{}); } -ABSL_CONST_INIT const void* const HashState::kSeed = &kSeed; +ABSL_CONST_INIT const void* const MixingHashState::kSeed = &kSeed; // The salt array used by Wyhash. This array is NOT the mechanism used to make // absl::Hash non-deterministic between program invocations. 
See `Seed()` for @@ -61,7 +59,7 @@ constexpr uint64_t kWyhashSalt[5] = { uint64_t{0x452821E638D01377}, }; -uint64_t HashState::WyhashImpl(const unsigned char* data, size_t len) { +uint64_t MixingHashState::WyhashImpl(const unsigned char* data, size_t len) { return Wyhash(data, len, Seed(), kWyhashSalt); } diff --git a/third_party/abseil-cpp/absl/hash/internal/hash.h b/third_party/abseil-cpp/absl/hash/internal/hash.h index 7fb0af0b96..69dbbc6ba0 100644 --- a/third_party/abseil-cpp/absl/hash/internal/hash.h +++ b/third_party/abseil-cpp/absl/hash/internal/hash.h @@ -379,7 +379,7 @@ template <typename H, typename... Ts> // This SFINAE gets MSVC confused under some conditions. Let's just disable it // for now. H -#else // _MSC_VER +#else // _MSC_VER typename std::enable_if<absl::conjunction<is_hashable<Ts>...>::value, H>::type #endif // _MSC_VER AbslHashValue(H hash_state, const std::tuple<Ts...>& t) { @@ -714,8 +714,8 @@ template <typename T> struct is_hashable : std::integral_constant<bool, HashSelect::template Apply<T>::value> {}; -// HashState -class ABSL_DLL HashState : public HashStateBase<HashState> { +// MixingHashState +class ABSL_DLL MixingHashState : public HashStateBase<MixingHashState> { // absl::uint128 is not an alias or a thin wrapper around the intrinsic. // We use the intrinsic when available to improve performance. #ifdef ABSL_HAVE_INTRINSIC_INT128 @@ -734,22 +734,23 @@ class ABSL_DLL HashState : public HashStateBase<HashState> { public: // Move only - HashState(HashState&&) = default; - HashState& operator=(HashState&&) = default; + MixingHashState(MixingHashState&&) = default; + MixingHashState& operator=(MixingHashState&&) = default; - // HashState::combine_contiguous() + // MixingHashState::combine_contiguous() // // Fundamental base case for hash recursion: mixes the given range of bytes // into the hash state. 
- static HashState combine_contiguous(HashState hash_state, - const unsigned char* first, size_t size) { - return HashState( + static MixingHashState combine_contiguous(MixingHashState hash_state, + const unsigned char* first, + size_t size) { + return MixingHashState( CombineContiguousImpl(hash_state.state_, first, size, std::integral_constant<int, sizeof(size_t)>{})); } - using HashState::HashStateBase::combine_contiguous; + using MixingHashState::HashStateBase::combine_contiguous; - // HashState::hash() + // MixingHashState::hash() // // For performance reasons in non-opt mode, we specialize this for // integral types. @@ -761,24 +762,24 @@ class ABSL_DLL HashState : public HashStateBase<HashState> { return static_cast<size_t>(Mix(Seed(), static_cast<uint64_t>(value))); } - // Overload of HashState::hash() + // Overload of MixingHashState::hash() template <typename T, absl::enable_if_t<!IntegralFastPath<T>::value, int> = 0> static size_t hash(const T& value) { - return static_cast<size_t>(combine(HashState{}, value).state_); + return static_cast<size_t>(combine(MixingHashState{}, value).state_); } private: // Invoked only once for a given argument; that plus the fact that this is // move-only ensures that there is only one non-moved-from object. - HashState() : state_(Seed()) {} + MixingHashState() : state_(Seed()) {} // Workaround for MSVC bug. // We make the type copyable to fix the calling convention, even though we // never actually copy it. Keep it private to not affect the public API of the // type. - HashState(const HashState&) = default; + MixingHashState(const MixingHashState&) = default; - explicit HashState(uint64_t state) : state_(state) {} + explicit MixingHashState(uint64_t state) : state_(state) {} // Implementation of the base case for combine_contiguous where we actually // mix the bytes into the state. 
@@ -793,7 +794,6 @@ class ABSL_DLL HashState : public HashStateBase<HashState> { std::integral_constant<int, 8> /* sizeof_size_t */); - // Slow dispatch path for calls to CombineContiguousImpl with a size argument // larger than PiecewiseChunkSize(). Has the same effect as calling // CombineContiguousImpl() repeatedly with the chunk stride size. @@ -911,8 +911,8 @@ class ABSL_DLL HashState : public HashStateBase<HashState> { uint64_t state_; }; -// HashState::CombineContiguousImpl() -inline uint64_t HashState::CombineContiguousImpl( +// MixingHashState::CombineContiguousImpl() +inline uint64_t MixingHashState::CombineContiguousImpl( uint64_t state, const unsigned char* first, size_t len, std::integral_constant<int, 4> /* sizeof_size_t */) { // For large values we use CityHash, for small ones we just use a @@ -934,8 +934,8 @@ inline uint64_t HashState::CombineContiguousImpl( return Mix(state, v); } -// Overload of HashState::CombineContiguousImpl() -inline uint64_t HashState::CombineContiguousImpl( +// Overload of MixingHashState::CombineContiguousImpl() +inline uint64_t MixingHashState::CombineContiguousImpl( uint64_t state, const unsigned char* first, size_t len, std::integral_constant<int, 8> /* sizeof_size_t */) { // For large values we use Wyhash or CityHash depending on the platform, for @@ -976,7 +976,9 @@ struct PoisonedHash : private AggregateBarrier { template <typename T> struct HashImpl { - size_t operator()(const T& value) const { return HashState::hash(value); } + size_t operator()(const T& value) const { + return MixingHashState::hash(value); + } }; template <typename T> diff --git a/third_party/abseil-cpp/absl/hash/internal/wyhash.h b/third_party/abseil-cpp/absl/hash/internal/wyhash.h index 4aff4e931a..2b534b4706 100644 --- a/third_party/abseil-cpp/absl/hash/internal/wyhash.h +++ b/third_party/abseil-cpp/absl/hash/internal/wyhash.h @@ -36,7 +36,7 @@ namespace hash_internal { // integers are hashed into the result. 
// // To allow all hashable types (including string_view and Span) to depend on -// this algoritm, we keep the API low-level, with as few dependencies as +// this algorithm, we keep the API low-level, with as few dependencies as // possible. uint64_t Wyhash(const void* data, size_t len, uint64_t seed, const uint64_t salt[5]); diff --git a/third_party/abseil-cpp/absl/memory/CMakeLists.txt b/third_party/abseil-cpp/absl/memory/CMakeLists.txt index 78fb7e1b31..9d50e1dcd4 100644 --- a/third_party/abseil-cpp/absl/memory/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/memory/CMakeLists.txt @@ -37,7 +37,7 @@ absl_cc_test( DEPS absl::memory absl::core_headers - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -51,5 +51,5 @@ absl_cc_test( absl::memory absl::config absl::exception_safety_testing - gmock_main + GTest::gmock_main ) diff --git a/third_party/abseil-cpp/absl/memory/memory.h b/third_party/abseil-cpp/absl/memory/memory.h index 2b5ff623d4..d63326068f 100644 --- a/third_party/abseil-cpp/absl/memory/memory.h +++ b/third_party/abseil-cpp/absl/memory/memory.h @@ -420,7 +420,7 @@ struct pointer_traits<T*> { // // A C++11 compatible implementation of C++17's std::allocator_traits. 
// -#if __cplusplus >= 201703L +#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) using std::allocator_traits; #else // __cplusplus >= 201703L template <typename Alloc> diff --git a/third_party/abseil-cpp/absl/meta/CMakeLists.txt b/third_party/abseil-cpp/absl/meta/CMakeLists.txt index 672ead2fd0..9de4bd3751 100644 --- a/third_party/abseil-cpp/absl/meta/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/meta/CMakeLists.txt @@ -35,7 +35,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::type_traits - gmock_main + GTest::gmock_main ) # component target diff --git a/third_party/abseil-cpp/absl/meta/type_traits.h b/third_party/abseil-cpp/absl/meta/type_traits.h index d5cb5f3be3..e7c123936d 100644 --- a/third_party/abseil-cpp/absl/meta/type_traits.h +++ b/third_party/abseil-cpp/absl/meta/type_traits.h @@ -499,6 +499,27 @@ struct is_trivially_copy_assignable #endif // ABSL_HAVE_STD_IS_TRIVIALLY_ASSIGNABLE }; +#if defined(__cpp_lib_remove_cvref) && __cpp_lib_remove_cvref >= 201711L +template <typename T> +using remove_cvref = std::remove_cvref<T>; + +template <typename T> +using remove_cvref_t = typename std::remove_cvref<T>::type; +#else +// remove_cvref() +// +// C++11 compatible implementation of std::remove_cvref which was added in +// C++20. +template <typename T> +struct remove_cvref { + using type = + typename std::remove_cv<typename std::remove_reference<T>::type>::type; +}; + +template <typename T> +using remove_cvref_t = typename remove_cvref<T>::type; +#endif + namespace type_traits_internal { // is_trivially_copyable() // @@ -613,7 +634,7 @@ using underlying_type_t = typename std::underlying_type<T>::type; namespace type_traits_internal { -#if __cplusplus >= 201703L +#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) // std::result_of is deprecated (C++17) or removed (C++20) template<typename> struct result_of; template<typename F, typename... 
Args> diff --git a/third_party/abseil-cpp/absl/meta/type_traits_test.cc b/third_party/abseil-cpp/absl/meta/type_traits_test.cc index 1aafd0d49a..0ef5b66558 100644 --- a/third_party/abseil-cpp/absl/meta/type_traits_test.cc +++ b/third_party/abseil-cpp/absl/meta/type_traits_test.cc @@ -942,6 +942,34 @@ TEST(TypeTraitsTest, TestTriviallyCopyable) { absl::type_traits_internal::is_trivially_copyable<Trivial&>::value); } +TEST(TypeTraitsTest, TestRemoveCVRef) { + EXPECT_TRUE( + (std::is_same<typename absl::remove_cvref<int>::type, int>::value)); + EXPECT_TRUE( + (std::is_same<typename absl::remove_cvref<int&>::type, int>::value)); + EXPECT_TRUE( + (std::is_same<typename absl::remove_cvref<int&&>::type, int>::value)); + EXPECT_TRUE(( + std::is_same<typename absl::remove_cvref<const int&>::type, int>::value)); + EXPECT_TRUE( + (std::is_same<typename absl::remove_cvref<int*>::type, int*>::value)); + // Does not remove const in this case. + EXPECT_TRUE((std::is_same<typename absl::remove_cvref<const int*>::type, + const int*>::value)); + EXPECT_TRUE((std::is_same<typename absl::remove_cvref<int[2]>::type, + int[2]>::value)); + EXPECT_TRUE((std::is_same<typename absl::remove_cvref<int(&)[2]>::type, + int[2]>::value)); + EXPECT_TRUE((std::is_same<typename absl::remove_cvref<int(&&)[2]>::type, + int[2]>::value)); + EXPECT_TRUE((std::is_same<typename absl::remove_cvref<const int[2]>::type, + int[2]>::value)); + EXPECT_TRUE((std::is_same<typename absl::remove_cvref<const int(&)[2]>::type, + int[2]>::value)); + EXPECT_TRUE((std::is_same<typename absl::remove_cvref<const int(&&)[2]>::type, + int[2]>::value)); +} + #define ABSL_INTERNAL_EXPECT_ALIAS_EQUIVALENCE(trait_name, ...) 
\ EXPECT_TRUE((std::is_same<typename std::trait_name<__VA_ARGS__>::type, \ absl::trait_name##_t<__VA_ARGS__>>::value)) diff --git a/third_party/abseil-cpp/absl/numeric/CMakeLists.txt b/third_party/abseil-cpp/absl/numeric/CMakeLists.txt index 781987dc88..26df5cf703 100644 --- a/third_party/abseil-cpp/absl/numeric/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/numeric/CMakeLists.txt @@ -38,7 +38,7 @@ absl_cc_test( absl::bits absl::core_headers absl::random_random - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -73,7 +73,7 @@ absl_cc_test( absl::core_headers absl::hash_testing absl::type_traits - gmock_main + GTest::gmock_main ) # component target diff --git a/third_party/abseil-cpp/absl/numeric/int128.h b/third_party/abseil-cpp/absl/numeric/int128.h index 0dd814a890..198aa19504 100644 --- a/third_party/abseil-cpp/absl/numeric/int128.h +++ b/third_party/abseil-cpp/absl/numeric/int128.h @@ -810,6 +810,14 @@ inline bool operator>=(uint128 lhs, uint128 rhs) { return !(lhs < rhs); } // Unary operators. +constexpr inline uint128 operator+(uint128 val) { + return val; +} + +constexpr inline int128 operator+(int128 val) { + return val; +} + inline uint128 operator-(uint128 val) { uint64_t hi = ~Uint128High64(val); uint64_t lo = ~Uint128Low64(val) + 1; @@ -817,27 +825,27 @@ inline uint128 operator-(uint128 val) { return MakeUint128(hi, lo); } -inline bool operator!(uint128 val) { +constexpr inline bool operator!(uint128 val) { return !Uint128High64(val) && !Uint128Low64(val); } // Logical operators. 
-inline uint128 operator~(uint128 val) { +constexpr inline uint128 operator~(uint128 val) { return MakeUint128(~Uint128High64(val), ~Uint128Low64(val)); } -inline uint128 operator|(uint128 lhs, uint128 rhs) { +constexpr inline uint128 operator|(uint128 lhs, uint128 rhs) { return MakeUint128(Uint128High64(lhs) | Uint128High64(rhs), Uint128Low64(lhs) | Uint128Low64(rhs)); } -inline uint128 operator&(uint128 lhs, uint128 rhs) { +constexpr inline uint128 operator&(uint128 lhs, uint128 rhs) { return MakeUint128(Uint128High64(lhs) & Uint128High64(rhs), Uint128Low64(lhs) & Uint128Low64(rhs)); } -inline uint128 operator^(uint128 lhs, uint128 rhs) { +constexpr inline uint128 operator^(uint128 lhs, uint128 rhs) { return MakeUint128(Uint128High64(lhs) ^ Uint128High64(rhs), Uint128Low64(lhs) ^ Uint128Low64(rhs)); } diff --git a/third_party/abseil-cpp/absl/numeric/int128_test.cc b/third_party/abseil-cpp/absl/numeric/int128_test.cc index bc86c714ac..c445d89a99 100644 --- a/third_party/abseil-cpp/absl/numeric/int128_test.cc +++ b/third_party/abseil-cpp/absl/numeric/int128_test.cc @@ -226,6 +226,11 @@ TEST(Uint128, AllTests) { EXPECT_EQ(test >>= 1, one); EXPECT_EQ(test <<= 1, two); + EXPECT_EQ(big, +big); + EXPECT_EQ(two, +two); + EXPECT_EQ(absl::Uint128Max(), +absl::Uint128Max()); + EXPECT_EQ(zero, +zero); + EXPECT_EQ(big, -(-big)); EXPECT_EQ(two, -((-one) - 1)); EXPECT_EQ(absl::Uint128Max(), -one); @@ -769,6 +774,19 @@ TEST(Int128, ComparisonTest) { } } +TEST(Int128, UnaryPlusTest) { + int64_t values64[] = {0, 1, 12345, 0x4000000000000000, + std::numeric_limits<int64_t>::max()}; + for (int64_t value : values64) { + SCOPED_TRACE(::testing::Message() << "value = " << value); + + EXPECT_EQ(absl::int128(value), +absl::int128(value)); + EXPECT_EQ(absl::int128(-value), +absl::int128(-value)); + EXPECT_EQ(absl::MakeInt128(value, 0), +absl::MakeInt128(value, 0)); + EXPECT_EQ(absl::MakeInt128(-value, 0), +absl::MakeInt128(-value, 0)); + } +} + TEST(Int128, UnaryNegationTest) { int64_t 
values64[] = {0, 1, 12345, 0x4000000000000000, std::numeric_limits<int64_t>::max()}; diff --git a/third_party/abseil-cpp/absl/random/CMakeLists.txt b/third_party/abseil-cpp/absl/random/CMakeLists.txt index 3009a0348a..9d1c67fb33 100644 --- a/third_party/abseil-cpp/absl/random/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/random/CMakeLists.txt @@ -62,8 +62,8 @@ absl_cc_test( absl::random_random absl::random_internal_sequence_urbg absl::fast_type_id - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -119,8 +119,8 @@ absl_cc_library( absl::type_traits absl::utility absl::variant - gmock - gtest + GTest::gmock + GTest::gtest TESTONLY ) @@ -136,8 +136,8 @@ absl_cc_test( DEPS absl::random_mocking_bit_gen absl::random_random - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -153,8 +153,8 @@ absl_cc_test( absl::random_bit_gen_ref absl::random_mocking_bit_gen absl::random_random - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_library( @@ -245,8 +245,8 @@ absl_cc_test( absl::random_random absl::random_internal_sequence_urbg absl::random_internal_pcg_engine - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -268,8 +268,8 @@ absl_cc_test( absl::raw_logging_internal absl::strings absl::str_format - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -285,8 +285,8 @@ absl_cc_test( absl::random_distributions absl::random_random absl::random_internal_distribution_test_util - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -301,8 +301,8 @@ absl_cc_test( absl::random_distributions absl::random_random absl::raw_logging_internal - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -322,8 +322,8 @@ absl_cc_test( absl::raw_logging_internal absl::strings absl::str_format - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -343,8 +343,8 @@ absl_cc_test( 
absl::random_random absl::raw_logging_internal absl::strings - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -367,8 +367,8 @@ absl_cc_test( absl::raw_logging_internal absl::strings absl::str_format - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -391,8 +391,8 @@ absl_cc_test( absl::raw_logging_internal absl::strings absl::str_format - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -414,8 +414,8 @@ absl_cc_test( absl::raw_logging_internal absl::strings absl::str_format - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -435,8 +435,8 @@ absl_cc_test( absl::random_random absl::raw_logging_internal absl::strings - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -456,8 +456,8 @@ absl_cc_test( absl::random_internal_sequence_urbg absl::random_random absl::strings - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -477,8 +477,8 @@ absl_cc_test( absl::random_random absl::raw_logging_internal absl::strings - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) absl_cc_test( @@ -492,7 +492,7 @@ absl_cc_test( ${ABSL_DEFAULT_LINKOPTS} DEPS absl::random_random - gtest_main + GTest::gtest_main ) absl_cc_test( @@ -508,8 +508,8 @@ absl_cc_test( absl::random_seed_sequences absl::random_internal_nonsecure_base absl::random_random - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -894,7 +894,7 @@ absl_cc_test( ${ABSL_DEFAULT_LINKOPTS} DEPS absl::random_internal_traits - gtest_main + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -911,7 +911,7 @@ absl_cc_test( absl::bits absl::flags absl::random_internal_generate_real - gtest_main + GTest::gtest_main ) # Internal-only target, do not depend on directly. 
@@ -926,7 +926,7 @@ absl_cc_test( ${ABSL_DEFAULT_LINKOPTS} DEPS absl::random_internal_distribution_test_util - gtest_main + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -941,7 +941,7 @@ absl_cc_test( ${ABSL_DEFAULT_LINKOPTS} DEPS absl::random_internal_fastmath - gtest_main + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -957,8 +957,8 @@ absl_cc_test( DEPS absl::random_internal_explicit_seed_seq absl::random_seed_sequences - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -973,8 +973,8 @@ absl_cc_test( ${ABSL_DEFAULT_LINKOPTS} DEPS absl::random_internal_salted_seed_seq - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -990,7 +990,7 @@ absl_cc_test( DEPS absl::core_headers absl::random_internal_distribution_test_util - gtest_main + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -1005,7 +1005,7 @@ absl_cc_test( ${ABSL_DEFAULT_LINKOPTS} DEPS absl::random_internal_fast_uniform_bits - gtest_main + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -1024,7 +1024,7 @@ absl_cc_test( absl::random_distributions absl::random_seed_sequences absl::strings - gtest_main + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -1039,8 +1039,8 @@ absl_cc_test( ${ABSL_DEFAULT_LINKOPTS} DEPS absl::random_internal_seed_material - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -1057,7 +1057,7 @@ absl_cc_test( absl::random_internal_pool_urbg absl::span absl::type_traits - gtest_main + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -1074,8 +1074,8 @@ absl_cc_test( absl::random_internal_explicit_seed_seq absl::random_internal_pcg_engine absl::time - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) # Internal-only target, do not depend on directly. 
@@ -1094,8 +1094,8 @@ absl_cc_test( absl::raw_logging_internal absl::strings absl::time - gmock - gtest_main + GTest::gmock + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -1111,7 +1111,7 @@ absl_cc_test( DEPS absl::random_internal_randen absl::type_traits - gtest_main + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -1127,7 +1127,7 @@ absl_cc_test( DEPS absl::endian absl::random_internal_randen_slow - gtest_main + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -1146,8 +1146,8 @@ absl_cc_test( absl::random_internal_randen_hwaes_impl absl::raw_logging_internal absl::str_format - gmock - gtest + GTest::gmock + GTest::gtest ) # Internal-only target, do not depend on directly. @@ -1178,7 +1178,7 @@ absl_cc_test( ${ABSL_DEFAULT_LINKOPTS} DEPS absl::random_internal_uniform_helper - gtest_main + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -1193,7 +1193,7 @@ absl_cc_test( ${ABSL_DEFAULT_LINKOPTS} DEPS absl::random_internal_iostream_state_saver - gtest_main + GTest::gtest_main ) # Internal-only target, do not depend on directly. @@ -1210,5 +1210,5 @@ absl_cc_test( absl::random_internal_wide_multiply absl::bits absl::int128 - gtest_main + GTest::gtest_main ) diff --git a/third_party/abseil-cpp/absl/random/beta_distribution_test.cc b/third_party/abseil-cpp/absl/random/beta_distribution_test.cc index 44cdfdd049..d980c969f7 100644 --- a/third_party/abseil-cpp/absl/random/beta_distribution_test.cc +++ b/third_party/abseil-cpp/absl/random/beta_distribution_test.cc @@ -15,6 +15,7 @@ #include "absl/random/beta_distribution.h" #include <algorithm> +#include <cfloat> #include <cstddef> #include <cstdint> #include <iterator> @@ -558,6 +559,14 @@ TEST(BetaDistributionTest, StabilityTest) { // dependencies of the distribution change, such as RandU64ToDouble, then this // is also likely to change. 
TEST(BetaDistributionTest, AlgorithmBounds) { +#if (defined(__i386__) || defined(_M_IX86)) && FLT_EVAL_METHOD != 0 + // We're using an x87-compatible FPU, and intermediate operations are + // performed with 80-bit floats. This produces slightly different results from + // what we expect below. + GTEST_SKIP() + << "Skipping the test because we detected x87 floating-point semantics"; +#endif + { absl::random_internal::sequence_urbg urbg( {0x7fbe76c8b4395800ull, 0x8000000000000000ull}); diff --git a/third_party/abseil-cpp/absl/random/discrete_distribution_test.cc b/third_party/abseil-cpp/absl/random/discrete_distribution_test.cc index 6d007006ef..415b14cc76 100644 --- a/third_party/abseil-cpp/absl/random/discrete_distribution_test.cc +++ b/third_party/abseil-cpp/absl/random/discrete_distribution_test.cc @@ -99,6 +99,7 @@ TYPED_TEST(DiscreteDistributionTypeTest, Constructor) { } TEST(DiscreteDistributionTest, InitDiscreteDistribution) { + using testing::_; using testing::Pair; { @@ -111,8 +112,8 @@ TEST(DiscreteDistributionTest, InitDiscreteDistribution) { // Each bucket is p=1/3, so bucket 0 will send half it's traffic // to bucket 2, while the rest will retain all of their traffic. 
EXPECT_THAT(q, testing::ElementsAre(Pair(0.5, 2), // - Pair(1.0, 1), // - Pair(1.0, 2))); + Pair(1.0, _), // + Pair(1.0, _))); } { @@ -135,7 +136,7 @@ TEST(DiscreteDistributionTest, InitDiscreteDistribution) { EXPECT_THAT(q, testing::ElementsAre(Pair(b0, 3), // Pair(b1, 3), // - Pair(1.0, 2), // + Pair(1.0, _), // Pair(b3, 2), // Pair(b1, 3))); } diff --git a/third_party/abseil-cpp/absl/random/distributions_test.cc b/third_party/abseil-cpp/absl/random/distributions_test.cc index 5866a07257..d3a5dd75e5 100644 --- a/third_party/abseil-cpp/absl/random/distributions_test.cc +++ b/third_party/abseil-cpp/absl/random/distributions_test.cc @@ -14,6 +14,7 @@ #include "absl/random/distributions.h" +#include <cfloat> #include <cmath> #include <cstdint> #include <random> @@ -224,6 +225,15 @@ TEST_F(RandomDistributionsTest, UniformNoBounds) { TEST_F(RandomDistributionsTest, UniformNonsenseRanges) { // The ranges used in this test are undefined behavior. // The results are arbitrary and subject to future changes. + +#if (defined(__i386__) || defined(_M_IX86)) && FLT_EVAL_METHOD != 0 + // We're using an x87-compatible FPU, and intermediate operations can be + // performed with 80-bit floats. This produces slightly different results from + // what we expect below. 
+ GTEST_SKIP() + << "Skipping the test because we detected x87 floating-point semantics"; +#endif + absl::InsecureBitGen gen; // <uint> diff --git a/third_party/abseil-cpp/absl/random/exponential_distribution_test.cc b/third_party/abseil-cpp/absl/random/exponential_distribution_test.cc index af11d61c15..81a5d17bac 100644 --- a/third_party/abseil-cpp/absl/random/exponential_distribution_test.cc +++ b/third_party/abseil-cpp/absl/random/exponential_distribution_test.cc @@ -15,6 +15,7 @@ #include "absl/random/exponential_distribution.h" #include <algorithm> +#include <cfloat> #include <cmath> #include <cstddef> #include <cstdint> @@ -384,6 +385,15 @@ TEST(ExponentialDistributionTest, StabilityTest) { TEST(ExponentialDistributionTest, AlgorithmBounds) { // Relies on absl::uniform_real_distribution, so some of these comments // reference that. + +#if (defined(__i386__) || defined(_M_IX86)) && FLT_EVAL_METHOD != 0 + // We're using an x87-compatible FPU, and intermediate operations can be + // performed with 80-bit floats. This produces slightly different results from + // what we expect below. + GTEST_SKIP() + << "Skipping the test because we detected x87 floating-point semantics"; +#endif + absl::exponential_distribution<double> dist; { diff --git a/third_party/abseil-cpp/absl/random/internal/pool_urbg.cc b/third_party/abseil-cpp/absl/random/internal/pool_urbg.cc index 5bee530770..725100a415 100644 --- a/third_party/abseil-cpp/absl/random/internal/pool_urbg.cc +++ b/third_party/abseil-cpp/absl/random/internal/pool_urbg.cc @@ -194,11 +194,10 @@ RandenPoolEntry* PoolAlignedAlloc() { // Not all the platforms that we build for have std::aligned_alloc, however // since we never free these objects, we can over allocate and munge the // pointers to the correct alignment. 
- void* memory = std::malloc(sizeof(RandenPoolEntry) + kAlignment); - auto x = reinterpret_cast<intptr_t>(memory); + intptr_t x = reinterpret_cast<intptr_t>( + new char[sizeof(RandenPoolEntry) + kAlignment]); auto y = x % kAlignment; - void* aligned = - (y == 0) ? memory : reinterpret_cast<void*>(x + kAlignment - y); + void* aligned = reinterpret_cast<void*>(y == 0 ? x : (x + kAlignment - y)); return new (aligned) RandenPoolEntry(); } diff --git a/third_party/abseil-cpp/absl/random/internal/seed_material.cc b/third_party/abseil-cpp/absl/random/internal/seed_material.cc index 4d38a57419..7c1d9efa42 100644 --- a/third_party/abseil-cpp/absl/random/internal/seed_material.cc +++ b/third_party/abseil-cpp/absl/random/internal/seed_material.cc @@ -28,6 +28,7 @@ #include <cstdlib> #include <cstring> +#include "absl/base/dynamic_annotations.h" #include "absl/base/internal/raw_logging.h" #include "absl/strings/ascii.h" #include "absl/strings/escaping.h" @@ -50,6 +51,12 @@ #endif +#if defined(__GLIBC__) && \ + (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 25)) +// glibc >= 2.25 has getentropy() +#define ABSL_RANDOM_USE_GET_ENTROPY 1 +#endif + #if defined(ABSL_RANDOM_USE_BCRYPT) #include <bcrypt.h> @@ -122,8 +129,32 @@ bool ReadSeedMaterialFromOSEntropyImpl(absl::Span<uint32_t> values) { #else +#if defined(ABSL_RANDOM_USE_GET_ENTROPY) +// On *nix, use getentropy() if supported. Note that libc may support +// getentropy(), but the kernel may not, in which case this function will return +// false. +bool ReadSeedMaterialFromGetEntropy(absl::Span<uint32_t> values) { + auto buffer = reinterpret_cast<uint8_t*>(values.data()); + size_t buffer_size = sizeof(uint32_t) * values.size(); + while (buffer_size > 0) { + // getentropy() has a maximum permitted length of 256. 
+ size_t to_read = std::min<size_t>(buffer_size, 256); + int result = getentropy(buffer, to_read); + if (result < 0) { + return false; + } + // https://github.com/google/sanitizers/issues/1173 + // MemorySanitizer can't see through getentropy(). + ABSL_ANNOTATE_MEMORY_IS_INITIALIZED(buffer, to_read); + buffer += to_read; + buffer_size -= to_read; + } + return true; +} +#endif // defined(ABSL_RANDOM_GETENTROPY) + // On *nix, read entropy from /dev/urandom. -bool ReadSeedMaterialFromOSEntropyImpl(absl::Span<uint32_t> values) { +bool ReadSeedMaterialFromDevURandom(absl::Span<uint32_t> values) { const char kEntropyFile[] = "/dev/urandom"; auto buffer = reinterpret_cast<uint8_t*>(values.data()); @@ -150,6 +181,17 @@ bool ReadSeedMaterialFromOSEntropyImpl(absl::Span<uint32_t> values) { return success; } +bool ReadSeedMaterialFromOSEntropyImpl(absl::Span<uint32_t> values) { +#if defined(ABSL_RANDOM_USE_GET_ENTROPY) + if (ReadSeedMaterialFromGetEntropy(values)) { + return true; + } +#endif + // Libc may support getentropy, but the kernel may not, so we still have + // to fallback to ReadSeedMaterialFromDevURandom(). 
+ return ReadSeedMaterialFromDevURandom(values); +} + #endif } // namespace diff --git a/third_party/abseil-cpp/absl/random/uniform_real_distribution_test.cc b/third_party/abseil-cpp/absl/random/uniform_real_distribution_test.cc index 18bcd3bce8..035bd284d1 100644 --- a/third_party/abseil-cpp/absl/random/uniform_real_distribution_test.cc +++ b/third_party/abseil-cpp/absl/random/uniform_real_distribution_test.cc @@ -14,6 +14,7 @@ #include "absl/random/uniform_real_distribution.h" +#include <cfloat> #include <cmath> #include <cstdint> #include <iterator> @@ -70,6 +71,14 @@ using RealTypes = TYPED_TEST_SUITE(UniformRealDistributionTest, RealTypes); TYPED_TEST(UniformRealDistributionTest, ParamSerializeTest) { +#if (defined(__i386__) || defined(_M_IX86)) && FLT_EVAL_METHOD != 0 + // We're using an x87-compatible FPU, and intermediate operations are + // performed with 80-bit floats. This produces slightly different results from + // what we expect below. + GTEST_SKIP() + << "Skipping the test because we detected x87 floating-point semantics"; +#endif + using param_type = typename absl::uniform_real_distribution<TypeParam>::param_type; diff --git a/third_party/abseil-cpp/absl/status/CMakeLists.txt b/third_party/abseil-cpp/absl/status/CMakeLists.txt index f0d798a373..1248dff03e 100644 --- a/third_party/abseil-cpp/absl/status/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/status/CMakeLists.txt @@ -50,7 +50,7 @@ absl_cc_test( DEPS absl::status absl::strings - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -84,5 +84,5 @@ absl_cc_test( DEPS absl::status absl::statusor - gmock_main + GTest::gmock_main ) diff --git a/third_party/abseil-cpp/absl/status/internal/status_internal.h b/third_party/abseil-cpp/absl/status/internal/status_internal.h index 279f8f55be..ac12940a6d 100644 --- a/third_party/abseil-cpp/absl/status/internal/status_internal.h +++ b/third_party/abseil-cpp/absl/status/internal/status_internal.h @@ -19,6 +19,17 @@ #include 
"absl/container/inlined_vector.h" #include "absl/strings/cord.h" +#ifndef SWIG +// Disabled for SWIG as it doesn't parse attributes correctly. +namespace absl { +ABSL_NAMESPACE_BEGIN +// Returned Status objects may not be ignored. Codesearch doesn't handle ifdefs +// as part of a class definitions (b/6995610), so we use a forward declaration. +class ABSL_MUST_USE_RESULT Status; +ABSL_NAMESPACE_END +} // namespace absl +#endif // !SWIG + namespace absl { ABSL_NAMESPACE_BEGIN @@ -36,12 +47,12 @@ using Payloads = absl::InlinedVector<Payload, 1>; // Reference-counted representation of Status data. struct StatusRep { - StatusRep(absl::StatusCode code, std::string message, - std::unique_ptr<status_internal::Payloads> payloads) + StatusRep(absl::StatusCode code_arg, absl::string_view message_arg, + std::unique_ptr<status_internal::Payloads> payloads_arg) : ref(int32_t{1}), - code(code), - message(std::move(message)), - payloads(std::move(payloads)) {} + code(code_arg), + message(message_arg), + payloads(std::move(payloads_arg)) {} std::atomic<int32_t> ref; absl::StatusCode code; diff --git a/third_party/abseil-cpp/absl/status/status.cc b/third_party/abseil-cpp/absl/status/status.cc index 51a0d26897..5a5cd5c239 100644 --- a/third_party/abseil-cpp/absl/status/status.cc +++ b/third_party/abseil-cpp/absl/status/status.cc @@ -207,19 +207,10 @@ void Status::UnrefNonInlined(uintptr_t rep) { } } -uintptr_t Status::NewRep( - absl::StatusCode code, absl::string_view msg, - std::unique_ptr<status_internal::Payloads> payloads) { - status_internal::StatusRep* rep = new status_internal::StatusRep( - code, std::string(msg.data(), msg.size()), - std::move(payloads)); - return PointerToRep(rep); -} - Status::Status(absl::StatusCode code, absl::string_view msg) : rep_(CodeToInlinedRep(code)) { if (code != absl::StatusCode::kOk && !msg.empty()) { - rep_ = NewRep(code, msg, nullptr); + rep_ = PointerToRep(new status_internal::StatusRep(code, msg, nullptr)); } } @@ -238,9 +229,9 @@ 
absl::StatusCode Status::code() const { void Status::PrepareToModify() { ABSL_RAW_CHECK(!ok(), "PrepareToModify shouldn't be called on OK status."); if (IsInlined(rep_)) { - rep_ = - NewRep(static_cast<absl::StatusCode>(raw_code()), absl::string_view(), - nullptr); + rep_ = PointerToRep(new status_internal::StatusRep( + static_cast<absl::StatusCode>(raw_code()), absl::string_view(), + nullptr)); return; } @@ -251,8 +242,9 @@ void Status::PrepareToModify() { if (rep->payloads) { payloads = absl::make_unique<status_internal::Payloads>(*rep->payloads); } - rep_ = NewRep(rep->code, message(), - std::move(payloads)); + status_internal::StatusRep* const new_rep = new status_internal::StatusRep( + rep->code, message(), std::move(payloads)); + rep_ = PointerToRep(new_rep); UnrefNonInlined(rep_i); } } diff --git a/third_party/abseil-cpp/absl/status/status.h b/third_party/abseil-cpp/absl/status/status.h index df9e330c00..2e05f46e87 100644 --- a/third_party/abseil-cpp/absl/status/status.h +++ b/third_party/abseil-cpp/absl/status/status.h @@ -291,6 +291,10 @@ enum class StatusToStringMode : int { kWithNoExtraData = 0, // ToString will contain the payloads. kWithPayload = 1 << 0, + // ToString will include all the extra data this Status has. + kWithEverything = ~kWithNoExtraData, + // Default mode used by ToString. Its exact value might change in the future. + kDefault = kWithPayload, }; // absl::StatusToStringMode is specified as a bitmask type, which means the @@ -410,7 +414,12 @@ inline StatusToStringMode& operator^=(StatusToStringMode& lhs, // return result; // } // -class ABSL_MUST_USE_RESULT Status final { +// For documentation see https://abseil.io/docs/cpp/guides/status. +// +// Returned Status objects may not be ignored. 
status_internal.h has a forward +// declaration of the form +// class ABSL_MUST_USE_RESULT Status; +class Status final { public: // Constructors @@ -502,7 +511,7 @@ class ABSL_MUST_USE_RESULT Status final { // result, and the payloads to be printed use the status payload printer // mechanism (which is internal). std::string ToString( - StatusToStringMode mode = StatusToStringMode::kWithPayload) const; + StatusToStringMode mode = StatusToStringMode::kDefault) const; // Status::IgnoreError() // diff --git a/third_party/abseil-cpp/absl/status/status_test.cc b/third_party/abseil-cpp/absl/status/status_test.cc index 7116ba671f..1b038f6d98 100644 --- a/third_party/abseil-cpp/absl/status/status_test.cc +++ b/third_party/abseil-cpp/absl/status/status_test.cc @@ -36,7 +36,9 @@ TEST(StatusCode, InsertionOperator) { // its creator, and its classifier. struct ErrorTest { absl::StatusCode code; - using Creator = absl::Status (*)(absl::string_view); + using Creator = absl::Status (*)( + absl::string_view + ); using Classifier = bool (*)(const absl::Status&); Creator creator; Classifier classifier; @@ -78,7 +80,9 @@ TEST(Status, CreateAndClassify) { // expected error code and message. 
std::string message = absl::StrCat("error code ", test.code, " test message"); - absl::Status status = test.creator(message); + absl::Status status = test.creator( + message + ); EXPECT_EQ(test.code, status.code()); EXPECT_EQ(message, status.message()); @@ -292,6 +296,10 @@ TEST(Status, ToStringMode) { AllOf(HasSubstr("INTERNAL: fail"), HasSubstr("[foo='bar']"), HasSubstr("[bar='\\xff']"))); + EXPECT_THAT(s.ToString(absl::StatusToStringMode::kWithEverything), + AllOf(HasSubstr("INTERNAL: fail"), HasSubstr("[foo='bar']"), + HasSubstr("[bar='\\xff']"))); + EXPECT_THAT(s.ToString(~absl::StatusToStringMode::kWithPayload), AllOf(HasSubstr("INTERNAL: fail"), Not(HasSubstr("[foo='bar']")), Not(HasSubstr("[bar='\\xff']")))); diff --git a/third_party/abseil-cpp/absl/status/statusor.h b/third_party/abseil-cpp/absl/status/statusor.h index 469d486fdd..b7c55cc8ac 100644 --- a/third_party/abseil-cpp/absl/status/statusor.h +++ b/third_party/abseil-cpp/absl/status/statusor.h @@ -135,7 +135,7 @@ class ABSL_MUST_USE_RESULT StatusOr; // // NOTE: using `absl::StatusOr<T>::value()` when no valid value is present will // throw an exception if exceptions are enabled or terminate the process when -// execeptions are not enabled. +// exceptions are not enabled. 
// // Example: // diff --git a/third_party/abseil-cpp/absl/strings/CMakeLists.txt b/third_party/abseil-cpp/absl/strings/CMakeLists.txt index 3b7ae639f5..0246dc3851 100644 --- a/third_party/abseil-cpp/absl/strings/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/strings/CMakeLists.txt @@ -101,7 +101,7 @@ absl_cc_test( DEPS absl::strings absl::base - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -115,7 +115,7 @@ absl_cc_test( absl::strings absl::core_headers absl::fixed_array - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -128,7 +128,7 @@ absl_cc_test( DEPS absl::strings absl::core_headers - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -142,7 +142,7 @@ absl_cc_test( DEPS absl::strings absl::core_headers - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -156,7 +156,7 @@ absl_cc_test( absl::strings_internal absl::base absl::core_headers - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -169,7 +169,7 @@ absl_cc_test( DEPS absl::strings absl::type_traits - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -184,7 +184,7 @@ absl_cc_test( absl::config absl::core_headers absl::dynamic_annotations - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -197,7 +197,7 @@ absl_cc_test( DEPS absl::strings absl::core_headers - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -209,7 +209,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::strings - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -221,12 +221,12 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::strings - absl::base absl::core_headers absl::dynamic_annotations + absl::btree absl::flat_hash_map absl::node_hash_map - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -238,7 +238,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::strings_internal - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -253,7 +253,7 @@ absl_cc_test( absl::base absl::core_headers absl::type_traits - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -268,7 +268,7 @@ absl_cc_test( absl::base absl::core_headers absl::memory - gmock_main 
+ GTest::gmock_main ) absl_cc_test( @@ -281,7 +281,7 @@ absl_cc_test( DEPS absl::strings absl::core_headers - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -301,7 +301,7 @@ absl_cc_test( absl::random_random absl::random_distributions absl::strings_internal - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -314,7 +314,7 @@ absl_cc_test( DEPS absl::strings absl::base - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -326,7 +326,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::strings_internal - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -340,7 +340,7 @@ absl_cc_test( absl::strings absl::str_format absl::pow10_helper - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -355,7 +355,7 @@ absl_cc_test( absl::strings absl::config absl::raw_logging_internal - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -370,7 +370,7 @@ absl_cc_test( DEPS absl::strings absl::config - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -428,7 +428,7 @@ absl_cc_test( absl::cord absl::strings absl::core_headers - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -442,7 +442,7 @@ absl_cc_test( absl::str_format absl::str_format_internal absl::strings - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -455,7 +455,7 @@ absl_cc_test( DEPS absl::str_format absl::str_format_internal - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -467,7 +467,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::str_format_internal - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -479,7 +479,7 @@ absl_cc_test( ${ABSL_TEST_COPTS} DEPS absl::str_format - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -494,7 +494,7 @@ absl_cc_test( absl::str_format_internal absl::raw_logging_internal absl::int128 - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -507,7 +507,7 @@ absl_cc_test( DEPS absl::str_format_internal absl::cord - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -520,7 +520,7 @@ absl_cc_test( DEPS absl::str_format_internal absl::core_headers - gmock_main + GTest::gmock_main ) 
absl_cc_library( @@ -547,39 +547,300 @@ absl_cc_test( DEPS absl::pow10_helper absl::str_format - gmock_main + GTest::gmock_main ) absl_cc_library( NAME - cord + cord_internal HDRS - "cord.h" - SRCS - "cord.cc" - "internal/cord_internal.cc" "internal/cord_internal.h" + "internal/cord_rep_flat.h" "internal/cord_rep_ring.h" - "internal/cord_rep_ring.cc" "internal/cord_rep_ring_reader.h" - "internal/cord_rep_flat.h" + SRCS + "internal/cord_internal.cc" + "internal/cord_rep_ring.cc" COPTS ${ABSL_DEFAULT_COPTS} DEPS - absl::base absl::base_internal absl::compressed_tuple absl::config absl::core_headers absl::endian + absl::inlined_vector + absl::layout + absl::raw_logging_internal + absl::strings + absl::throw_delegate + absl::type_traits +) + +absl_cc_library( + NAME + cordz_update_tracker + HDRS + "internal/cordz_update_tracker.h" + COPTS + ${ABSL_DEFAULT_COPTS} + DEPS + absl::config +) + +absl_cc_test( + NAME + cordz_update_tracker_test + SRCS + "internal/cordz_update_tracker_test.cc" + COPTS + ${ABSL_TEST_COPTS} + DEPS + absl::config + absl::cordz_update_tracker + absl::core_headers + absl::synchronization + GTest::gmock_main +) + +absl_cc_library( + NAME + cordz_functions + HDRS + "internal/cordz_functions.h" + SRCS + "internal/cordz_functions.cc" + COPTS + ${ABSL_DEFAULT_COPTS} + DEPS + absl::config + absl::core_headers + absl::exponential_biased + absl::raw_logging_internal +) + +absl_cc_test( + NAME + cordz_functions_test + SRCS + "internal/cordz_functions_test.cc" + DEPS + absl::config + absl::cordz_functions + absl::cordz_test_helpers + GTest::gmock_main +) + +absl_cc_library( + NAME + cordz_statistics + HDRS + "internal/cordz_statistics.h" + COPTS + ${ABSL_DEFAULT_COPTS} + DEPS + absl::config + absl::core_headers + absl::cordz_update_tracker + absl::synchronization +) + +absl_cc_library( + NAME + cordz_handle + HDRS + "internal/cordz_handle.h" + SRCS + "internal/cordz_handle.cc" + COPTS + ${ABSL_DEFAULT_COPTS} + DEPS + absl::base + absl::config + 
absl::raw_logging_internal + absl::synchronization +) + +absl_cc_test( + NAME + cordz_handle_test + SRCS + "internal/cordz_handle_test.cc" + DEPS + absl::config + absl::cordz_handle + absl::cordz_test_helpers + absl::memory + absl::random_random + absl::random_distributions + absl::synchronization + absl::time + GTest::gmock_main +) + +absl_cc_library( + NAME + cordz_info + HDRS + "internal/cordz_info.h" + SRCS + "internal/cordz_info.cc" + COPTS + ${ABSL_DEFAULT_COPTS} + DEPS + absl::base + absl::config + absl::cord_internal + absl::cordz_functions + absl::cordz_handle + absl::cordz_statistics + absl::cordz_update_tracker + absl::core_headers + absl::inlined_vector + absl::span + absl::raw_logging_internal + absl::stacktrace + absl::synchronization +) + +absl_cc_test( + NAME + cordz_info_test + SRCS + "internal/cordz_info_test.cc" + COPTS + ${ABSL_TEST_COPTS} + DEPS + absl::config + absl::cord_internal + absl::cordz_test_helpers + absl::cordz_handle + absl::cordz_info + absl::cordz_statistics + absl::cordz_test_helpers + absl::cordz_update_tracker + absl::span + absl::stacktrace + absl::symbolize + GTest::gmock_main +) + +absl_cc_test( + NAME + cordz_info_statistics_test + SRCS + "internal/cordz_info_statistics_test.cc" + COPTS + ${ABSL_TEST_COPTS} + DEPS + absl::config + absl::cord + absl::cord_internal + absl::cordz_info + absl::cordz_sample_token + absl::cordz_statistics + absl::cordz_update_scope + absl::cordz_update_tracker + absl::thread_pool + GTest::gmock_main +) + +absl_cc_library( + NAME + cordz_sample_token + HDRS + "internal/cordz_sample_token.h" + SRCS + "internal/cordz_sample_token.cc" + COPTS + ${ABSL_DEFAULT_COPTS} + DEPS + absl::config + absl::cordz_handle + absl::cordz_info +) + +absl_cc_test( + NAME + cordz_sample_token_test + SRCS + "internal/cordz_sample_token_test.cc" + COPTS + ${ABSL_TEST_COPTS} + DEPS + absl::config + absl::cord_internal + absl::cordz_handle + absl::cordz_info + absl::cordz_info + absl::cordz_sample_token + 
absl::cordz_test_helpers + absl::memory + absl::random_random + absl::synchronization + absl::thread_pool + absl::time + GTest::gmock_main +) + +absl_cc_library( + NAME + cordz_update_scope + HDRS + "internal/cordz_update_scope.h" + COPTS + ${ABSL_DEFAULT_COPTS} + DEPS + absl::config + absl::cord_internal + absl::cordz_info + absl::cordz_update_tracker + absl::core_headers +) + +absl_cc_test( + NAME + cordz_update_scope_test + SRCS + "internal/cordz_update_scope_test.cc" + COPTS + ${ABSL_TEST_COPTS} + DEPS + absl::config + absl::cord_internal + absl::cordz_info + absl::cordz_test_helpers + absl::cordz_update_scope + absl::cordz_update_tracker + absl::core_headers + GTest::gmock_main +) + +absl_cc_library( + NAME + cord + HDRS + "cord.h" + SRCS + "cord.cc" + COPTS + ${ABSL_DEFAULT_COPTS} + DEPS + absl::base + absl::config + absl::cord_internal + absl::cordz_functions + absl::cordz_info + absl::cordz_update_scope + absl::cordz_update_tracker + absl::core_headers + absl::endian absl::fixed_array absl::function_ref absl::inlined_vector absl::optional absl::raw_logging_internal absl::strings - absl::strings_internal - absl::throw_delegate absl::type_traits PUBLIC ) @@ -592,7 +853,30 @@ absl_cc_library( COPTS ${ABSL_TEST_COPTS} DEPS + absl::config + absl::cord + absl::cord_internal + absl::strings + TESTONLY +) + +absl_cc_library( + NAME + cordz_test_helpers + HDRS + "cordz_test_helpers.h" + COPTS + ${ABSL_TEST_COPTS} + DEPS + absl::config absl::cord + absl::cord_internal + absl::cordz_info + absl::cordz_sample_token + absl::cordz_statistics + absl::cordz_update_tracker + absl::core_headers + absl::strings TESTONLY ) @@ -609,11 +893,13 @@ absl_cc_test( absl::strings absl::base absl::config + absl::cord_test_helpers + absl::cordz_test_helpers absl::core_headers absl::endian absl::raw_logging_internal absl::fixed_array - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -624,13 +910,13 @@ absl_cc_test( COPTS ${ABSL_TEST_COPTS} DEPS - absl::config - absl::cord - 
absl::strings absl::base + absl::config + absl::cord_internal absl::core_headers absl::raw_logging_internal - gmock_main + absl::strings + GTest::gmock_main ) absl_cc_test( @@ -641,9 +927,33 @@ absl_cc_test( COPTS ${ABSL_TEST_COPTS} DEPS - absl::cord + absl::base + absl::cord_internal + absl::core_headers absl::strings + GTest::gmock_main +) + +absl_cc_test( + NAME + cordz_test + SRCS + "cordz_test.cc" + COPTS + ${ABSL_TEST_COPTS} + DEPS + absl::cord + absl::cord_test_helpers + absl::cordz_test_helpers + absl::cordz_functions + absl::cordz_info + absl::cordz_sample_token + absl::cordz_statistics + absl::cordz_update_tracker absl::base + absl::config absl::core_headers - gmock_main + absl::raw_logging_internal + absl::strings + GTest::gmock_main ) diff --git a/third_party/abseil-cpp/absl/strings/charconv.cc b/third_party/abseil-cpp/absl/strings/charconv.cc index b8674c2802..fefcfc90a5 100644 --- a/third_party/abseil-cpp/absl/strings/charconv.cc +++ b/third_party/abseil-cpp/absl/strings/charconv.cc @@ -111,7 +111,7 @@ struct FloatTraits<double> { return sign ? -ldexp(mantissa, exponent) : ldexp(mantissa, exponent); #else constexpr uint64_t kMantissaMask = - (uint64_t(1) << (kTargetMantissaBits - 1)) - 1; + (uint64_t{1} << (kTargetMantissaBits - 1)) - 1; uint64_t dbl = static_cast<uint64_t>(sign) << 63; if (mantissa > kMantissaMask) { // Normal value. @@ -151,7 +151,7 @@ struct FloatTraits<float> { return sign ? -ldexpf(mantissa, exponent) : ldexpf(mantissa, exponent); #else constexpr uint32_t kMantissaMask = - (uint32_t(1) << (kTargetMantissaBits - 1)) - 1; + (uint32_t{1} << (kTargetMantissaBits - 1)) - 1; uint32_t flt = static_cast<uint32_t>(sign) << 31; if (mantissa > kMantissaMask) { // Normal value. 
@@ -499,7 +499,7 @@ bool MustRoundUp(uint64_t guess_mantissa, int guess_exponent, template <typename FloatType> CalculatedFloat CalculatedFloatFromRawValues(uint64_t mantissa, int exponent) { CalculatedFloat result; - if (mantissa == uint64_t(1) << FloatTraits<FloatType>::kTargetMantissaBits) { + if (mantissa == uint64_t{1} << FloatTraits<FloatType>::kTargetMantissaBits) { mantissa >>= 1; exponent += 1; } diff --git a/third_party/abseil-cpp/absl/strings/cord.cc b/third_party/abseil-cpp/absl/strings/cord.cc index 93533757f5..f5aa6e4788 100644 --- a/third_party/abseil-cpp/absl/strings/cord.cc +++ b/third_party/abseil-cpp/absl/strings/cord.cc @@ -38,6 +38,9 @@ #include "absl/strings/internal/cord_internal.h" #include "absl/strings/internal/cord_rep_flat.h" #include "absl/strings/internal/cord_rep_ring.h" +#include "absl/strings/internal/cordz_statistics.h" +#include "absl/strings/internal/cordz_update_scope.h" +#include "absl/strings/internal/cordz_update_tracker.h" #include "absl/strings/internal/resize_uninitialized.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" @@ -53,8 +56,10 @@ using ::absl::cord_internal::CordRepExternal; using ::absl::cord_internal::CordRepFlat; using ::absl::cord_internal::CordRepRing; using ::absl::cord_internal::CordRepSubstring; -using ::absl::cord_internal::kMinFlatLength; +using ::absl::cord_internal::CordzUpdateTracker; +using ::absl::cord_internal::InlineData; using ::absl::cord_internal::kMaxFlatLength; +using ::absl::cord_internal::kMinFlatLength; using ::absl::cord_internal::CONCAT; using ::absl::cord_internal::EXTERNAL; @@ -206,7 +211,7 @@ static CordRep* MakeBalancedTree(CordRep** reps, size_t n) { } static CordRepFlat* CreateFlat(const char* data, size_t length, - size_t alloc_hint) { + size_t alloc_hint) { CordRepFlat* flat = CordRepFlat::New(length + alloc_hint); flat->length = length; memcpy(flat->Data(), data, length); @@ -230,9 +235,7 @@ static CordRep* RingNewTree(const char* data, size_t length, 
// Create a new tree out of the specified array. // The returned node has a refcount of 1. -static CordRep* NewTree(const char* data, - size_t length, - size_t alloc_hint) { +static CordRep* NewTree(const char* data, size_t length, size_t alloc_hint) { if (length == 0) return nullptr; if (cord_ring_enabled()) { return RingNewTree(data, length, alloc_hint); @@ -279,6 +282,35 @@ static CordRep* NewSubstring(CordRep* child, size_t offset, size_t length) { } } +// Creates a CordRep from the provided string. If the string is large enough, +// and not wasteful, we move the string into an external cord rep, preserving +// the already allocated string contents. +// Requires the provided string length to be larger than `kMaxInline`. +static CordRep* CordRepFromString(std::string&& src) { + assert(src.length() > cord_internal::kMaxInline); + if ( + // String is short: copy data to avoid external block overhead. + src.size() <= kMaxBytesToCopy || + // String is wasteful: copy data to avoid pinning too much unused memory. + src.size() < src.capacity() / 2 + ) { + return NewTree(src.data(), src.size(), 0); + } + + struct StringReleaser { + void operator()(absl::string_view /* data */) {} + std::string data; + }; + const absl::string_view original_data = src; + auto* rep = + static_cast<::absl::cord_internal::CordRepExternalImpl<StringReleaser>*>( + absl::cord_internal::NewExternalRep(original_data, + StringReleaser{std::move(src)})); + // Moving src may have invalidated its data pointer, so adjust it. 
+ rep->base = rep->template get<0>().data.data(); + return rep; +} + // -------------------------------------------------------------------- // Cord::InlineRep functions @@ -299,20 +331,6 @@ inline char* Cord::InlineRep::set_data(size_t n) { return data_.as_chars(); } -inline CordRep* Cord::InlineRep::force_tree(size_t extra_hint) { - if (data_.is_tree()) { - return data_.as_tree(); - } - - size_t len = inline_size(); - CordRepFlat* result = CordRepFlat::New(len + extra_hint); - result->length = len; - static_assert(kMinFlatLength >= sizeof(data_), ""); - memcpy(result->Data(), data_.as_chars(), sizeof(data_)); - set_tree(result); - return result; -} - inline void Cord::InlineRep::reduce_size(size_t n) { size_t tag = inline_size(); assert(tag <= kMaxInline); @@ -334,25 +352,72 @@ static CordRepRing* ForceRing(CordRep* rep, size_t extra) { return (rep->tag == RING) ? rep->ring() : CordRepRing::Create(rep, extra); } -void Cord::InlineRep::AppendTree(CordRep* tree) { +void Cord::InlineRep::AppendTreeToInlined(CordRep* tree, + MethodIdentifier method) { + assert(!is_tree()); + if (!data_.is_empty()) { + CordRepFlat* flat = MakeFlatWithExtraCapacity(0); + if (cord_ring_enabled()) { + tree = CordRepRing::Append(CordRepRing::Create(flat, 1), tree); + } else { + tree = Concat(flat, tree); + } + } + EmplaceTree(tree, method); +} + +void Cord::InlineRep::AppendTreeToTree(CordRep* tree, MethodIdentifier method) { + assert(is_tree()); + const CordzUpdateScope scope(data_.cordz_info(), method); + if (cord_ring_enabled()) { + tree = CordRepRing::Append(ForceRing(data_.as_tree(), 1), tree); + } else { + tree = Concat(data_.as_tree(), tree); + } + SetTree(tree, scope); +} + +void Cord::InlineRep::AppendTree(CordRep* tree, MethodIdentifier method) { if (tree == nullptr) return; - if (data_.is_empty()) { - set_tree(tree); - } else if (cord_ring_enabled()) { - set_tree(CordRepRing::Append(ForceRing(force_tree(0), 1), tree)); + if (data_.is_tree()) { + AppendTreeToTree(tree, method); 
} else { - set_tree(Concat(force_tree(0), tree)); + AppendTreeToInlined(tree, method); } } -void Cord::InlineRep::PrependTree(CordRep* tree) { +void Cord::InlineRep::PrependTreeToInlined(CordRep* tree, + MethodIdentifier method) { + assert(!is_tree()); + if (!data_.is_empty()) { + CordRepFlat* flat = MakeFlatWithExtraCapacity(0); + if (cord_ring_enabled()) { + tree = CordRepRing::Prepend(CordRepRing::Create(flat, 1), tree); + } else { + tree = Concat(tree, flat); + } + } + EmplaceTree(tree, method); +} + +void Cord::InlineRep::PrependTreeToTree(CordRep* tree, + MethodIdentifier method) { + assert(is_tree()); + const CordzUpdateScope scope(data_.cordz_info(), method); + if (cord_ring_enabled()) { + tree = CordRepRing::Prepend(ForceRing(data_.as_tree(), 1), tree); + } else { + tree = Concat(tree, data_.as_tree()); + } + SetTree(tree, scope); +} + +void Cord::InlineRep::PrependTree(CordRep* tree, MethodIdentifier method) { assert(tree != nullptr); - if (data_.is_empty()) { - set_tree(tree); - } else if (cord_ring_enabled()) { - set_tree(CordRepRing::Prepend(ForceRing(force_tree(0), 1), tree)); + if (data_.is_tree()) { + PrependTreeToTree(tree, method); } else { - set_tree(Concat(tree, force_tree(0))); + PrependTreeToInlined(tree, method); } } @@ -404,76 +469,43 @@ static inline bool PrepareAppendRegion(CordRep* root, char** region, return true; } +template <bool has_length> void Cord::InlineRep::GetAppendRegion(char** region, size_t* size, - size_t max_length) { - if (max_length == 0) { - *region = nullptr; - *size = 0; - return; - } - - // Try to fit in the inline buffer if possible. - if (!is_tree()) { - size_t inline_length = inline_size(); - if (max_length <= kMaxInline - inline_length) { - *region = data_.as_chars() + inline_length; - *size = max_length; - set_inline_size(inline_length + max_length); - return; - } - } - - CordRep* root = force_tree(max_length); - - if (PrepareAppendRegion(root, region, size, max_length)) { - return; - } - - // Allocate new node. 
- CordRepFlat* new_node = - CordRepFlat::New(std::max(static_cast<size_t>(root->length), max_length)); - new_node->length = std::min(new_node->Capacity(), max_length); - *region = new_node->Data(); - *size = new_node->length; - - if (cord_ring_enabled()) { - replace_tree(CordRepRing::Append(ForceRing(root, 1), new_node)); - return; - } - replace_tree(Concat(root, new_node)); -} - -void Cord::InlineRep::GetAppendRegion(char** region, size_t* size) { - const size_t max_length = std::numeric_limits<size_t>::max(); - - // Try to fit in the inline buffer if possible. - if (!data_.is_tree()) { - size_t inline_length = inline_size(); - if (inline_length < kMaxInline) { - *region = data_.as_chars() + inline_length; - *size = kMaxInline - inline_length; - set_inline_size(kMaxInline); + size_t length) { + auto constexpr method = CordzUpdateTracker::kGetAppendRegion; + + CordRep* root = tree(); + size_t sz = root ? root->length : inline_size(); + if (root == nullptr) { + size_t available = kMaxInline - sz; + if (available >= (has_length ? length : 1)) { + *region = data_.as_chars() + sz; + *size = has_length ? length : available; + set_inline_size(has_length ? sz + length : kMaxInline); return; } } - CordRep* root = force_tree(max_length); - - if (PrepareAppendRegion(root, region, size, max_length)) { + size_t extra = has_length ? length : (std::max)(sz, kMinFlatLength); + CordRep* rep = root ? root : MakeFlatWithExtraCapacity(extra); + CordzUpdateScope scope(root ? data_.cordz_info() : nullptr, method); + if (PrepareAppendRegion(rep, region, size, length)) { + CommitTree(root, rep, scope, method); return; } // Allocate new node. 
- CordRepFlat* new_node = CordRepFlat::New(root->length); - new_node->length = new_node->Capacity(); + CordRepFlat* new_node = CordRepFlat::New(extra); + new_node->length = std::min(new_node->Capacity(), length); *region = new_node->Data(); *size = new_node->length; if (cord_ring_enabled()) { - replace_tree(CordRepRing::Append(ForceRing(root, 1), new_node)); - return; + rep = CordRepRing::Append(ForceRing(rep, 1), new_node); + } else { + rep = Concat(rep, new_node); } - replace_tree(Concat(root, new_node)); + CommitTree(root, rep, scope, method); } // If the rep is a leaf, this will increment the value at total_mem_usage and @@ -484,68 +516,67 @@ static bool RepMemoryUsageLeaf(const CordRep* rep, size_t* total_mem_usage) { return true; } if (rep->tag == EXTERNAL) { - *total_mem_usage += sizeof(CordRepConcat) + rep->length; + // We don't know anything about the embedded / bound data, but we can safely + // assume it is 'at least' a word / pointer to data. In the future we may + // choose to use the 'data' byte as a tag to identify the types of some + // well-known externals, such as a std::string instance. + *total_mem_usage += + sizeof(cord_internal::CordRepExternalImpl<intptr_t>) + rep->length; return true; } return false; } void Cord::InlineRep::AssignSlow(const Cord::InlineRep& src) { - ClearSlow(); + assert(&src != this); + assert(is_tree() || src.is_tree()); + auto constexpr method = CordzUpdateTracker::kAssignCord; + if (ABSL_PREDICT_TRUE(!is_tree())) { + EmplaceTree(CordRep::Ref(src.as_tree()), src.data_, method); + return; + } - data_ = src.data_; - if (is_tree()) { - data_.set_profiled(false); - CordRep::Ref(tree()); - clear_cordz_info(); + CordRep* tree = as_tree(); + if (CordRep* src_tree = src.tree()) { + // Leave any existing `cordz_info` in place, and let MaybeTrackCord() + // decide if this cord should be (or remains to be) sampled or not. 
+ data_.set_tree(CordRep::Ref(src_tree)); + CordzInfo::MaybeTrackCord(data_, src.data_, method); + } else { + CordzInfo::MaybeUntrackCord(data_.cordz_info()); + data_ = src.data_; } + CordRep::Unref(tree); } -void Cord::InlineRep::ClearSlow() { +void Cord::InlineRep::UnrefTree() { if (is_tree()) { + CordzInfo::MaybeUntrackCord(data_.cordz_info()); CordRep::Unref(tree()); } - ResetToEmpty(); } // -------------------------------------------------------------------- // Constructors and destructors -Cord::Cord(absl::string_view src) { +Cord::Cord(absl::string_view src, MethodIdentifier method) + : contents_(InlineData::kDefaultInit) { const size_t n = src.size(); if (n <= InlineRep::kMaxInline) { - contents_.set_data(src.data(), n, false); + contents_.set_data(src.data(), n, true); } else { - contents_.set_tree(NewTree(src.data(), n, 0)); + CordRep* rep = NewTree(src.data(), n, 0); + contents_.EmplaceTree(rep, method); } } template <typename T, Cord::EnableIfString<T>> -Cord::Cord(T&& src) { - if ( - // String is short: copy data to avoid external block overhead. - src.size() <= kMaxBytesToCopy || - // String is wasteful: copy data to avoid pinning too much unused memory. - src.size() < src.capacity() / 2 - ) { - if (src.size() <= InlineRep::kMaxInline) { - contents_.set_data(src.data(), src.size(), false); - } else { - contents_.set_tree(NewTree(src.data(), src.size(), 0)); - } +Cord::Cord(T&& src) : contents_(InlineData::kDefaultInit) { + if (src.size() <= InlineRep::kMaxInline) { + contents_.set_data(src.data(), src.size(), true); } else { - struct StringReleaser { - void operator()(absl::string_view /* data */) {} - std::string data; - }; - const absl::string_view original_data = src; - auto* rep = static_cast< - ::absl::cord_internal::CordRepExternalImpl<StringReleaser>*>( - absl::cord_internal::NewExternalRep( - original_data, StringReleaser{std::forward<T>(src)})); - // Moving src may have invalidated its data pointer, so adjust it. 
- rep->base = rep->template get<0>().data.data(); - contents_.set_tree(rep); + CordRep* rep = CordRepFromString(std::forward<T>(src)); + contents_.EmplaceTree(rep, CordzUpdateTracker::kConstructorString); } } @@ -554,9 +585,9 @@ template Cord::Cord(std::string&& src); // The destruction code is separate so that the compiler can determine // that it does not need to call the destructor on a moved-from Cord. void Cord::DestroyCordSlow() { - if (CordRep* tree = contents_.tree()) { - CordRep::Unref(VerifyTree(tree)); - } + assert(contents_.is_tree()); + CordzInfo::MaybeUntrackCord(contents_.cordz_info()); + CordRep::Unref(VerifyTree(contents_.as_tree())); } // -------------------------------------------------------------------- @@ -568,109 +599,117 @@ void Cord::Clear() { } } -Cord& Cord::operator=(absl::string_view src) { +Cord& Cord::AssignLargeString(std::string&& src) { + auto constexpr method = CordzUpdateTracker::kAssignString; + assert(src.size() > kMaxBytesToCopy); + CordRep* rep = CordRepFromString(std::move(src)); + if (CordRep* tree = contents_.tree()) { + CordzUpdateScope scope(contents_.cordz_info(), method); + contents_.SetTree(rep, scope); + CordRep::Unref(tree); + } else { + contents_.EmplaceTree(rep, method); + } + return *this; +} +Cord& Cord::operator=(absl::string_view src) { + auto constexpr method = CordzUpdateTracker::kAssignString; const char* data = src.data(); size_t length = src.size(); CordRep* tree = contents_.tree(); if (length <= InlineRep::kMaxInline) { - // Embed into this->contents_ + // Embed into this->contents_, which is somewhat subtle: + // - MaybeUntrackCord must be called before Unref(tree). + // - MaybeUntrackCord must be called before set_data() clobbers cordz_info. + // - set_data() must be called before Unref(tree) as it may reference tree. 
+ if (tree != nullptr) CordzInfo::MaybeUntrackCord(contents_.cordz_info()); contents_.set_data(data, length, true); - if (tree) CordRep::Unref(tree); - return *this; - } - if (tree != nullptr && tree->tag >= FLAT && - tree->flat()->Capacity() >= length && - tree->refcount.IsOne()) { - // Copy in place if the existing FLAT node is reusable. - memmove(tree->flat()->Data(), data, length); - tree->length = length; - VerifyTree(tree); + if (tree != nullptr) CordRep::Unref(tree); return *this; } - contents_.set_tree(NewTree(data, length, 0)); - if (tree) CordRep::Unref(tree); - return *this; -} - -template <typename T, Cord::EnableIfString<T>> -Cord& Cord::operator=(T&& src) { - if (src.size() <= kMaxBytesToCopy) { - *this = absl::string_view(src); + if (tree != nullptr) { + CordzUpdateScope scope(contents_.cordz_info(), method); + if (tree->tag >= FLAT && tree->flat()->Capacity() >= length && + tree->refcount.IsOne()) { + // Copy in place if the existing FLAT node is reusable. + memmove(tree->flat()->Data(), data, length); + tree->length = length; + VerifyTree(tree); + return *this; + } + contents_.SetTree(NewTree(data, length, 0), scope); + CordRep::Unref(tree); } else { - *this = Cord(std::forward<T>(src)); + contents_.EmplaceTree(NewTree(data, length, 0), method); } return *this; } -template Cord& Cord::operator=(std::string&& src); - // TODO(sanjay): Move to Cord::InlineRep section of file. For now, // we keep it here to make diffs easier. -void Cord::InlineRep::AppendArray(const char* src_data, size_t src_size) { - if (src_size == 0) return; // memcpy(_, nullptr, 0) is undefined. +void Cord::InlineRep::AppendArray(absl::string_view src, + MethodIdentifier method) { + if (src.empty()) return; // memcpy(_, nullptr, 0) is undefined. size_t appended = 0; - CordRep* root = nullptr; - if (is_tree()) { - root = data_.as_tree(); + CordRep* rep = tree(); + const CordRep* const root = rep; + CordzUpdateScope scope(root ? 
cordz_info() : nullptr, method); + if (root != nullptr) { char* region; - if (PrepareAppendRegion(root, ®ion, &appended, src_size)) { - memcpy(region, src_data, appended); + if (PrepareAppendRegion(rep, ®ion, &appended, src.size())) { + memcpy(region, src.data(), appended); } } else { // Try to fit in the inline buffer if possible. size_t inline_length = inline_size(); - if (src_size <= kMaxInline - inline_length) { + if (src.size() <= kMaxInline - inline_length) { // Append new data to embedded array - memcpy(data_.as_chars() + inline_length, src_data, src_size); - set_inline_size(inline_length + src_size); + memcpy(data_.as_chars() + inline_length, src.data(), src.size()); + set_inline_size(inline_length + src.size()); return; } - // It is possible that src_data == data_, but when we transition from an - // InlineRep to a tree we need to assign data_ = root via set_tree. To - // avoid corrupting the source data before we copy it, delay calling - // set_tree until after we've copied data. + // Note: we don't concern ourselves if src aliases data stored in the + // inlined data of 'this', as we update the InlineData only at the end. // We are going from an inline size to beyond inline size. Make the new size // either double the inlined size, or the added size + 10%. 
- const size_t size1 = inline_length * 2 + src_size; - const size_t size2 = inline_length + src_size / 10; - root = CordRepFlat::New(std::max<size_t>(size1, size2)); - appended = std::min( - src_size, root->flat()->Capacity() - inline_length); - memcpy(root->flat()->Data(), data_.as_chars(), inline_length); - memcpy(root->flat()->Data() + inline_length, src_data, appended); - root->length = inline_length + appended; - set_tree(root); - } - - src_data += appended; - src_size -= appended; - if (src_size == 0) { + const size_t size1 = inline_length * 2 + src.size(); + const size_t size2 = inline_length + src.size() / 10; + rep = CordRepFlat::New(std::max<size_t>(size1, size2)); + appended = std::min(src.size(), rep->flat()->Capacity() - inline_length); + memcpy(rep->flat()->Data(), data_.as_chars(), inline_length); + memcpy(rep->flat()->Data() + inline_length, src.data(), appended); + rep->length = inline_length + appended; + } + + src.remove_prefix(appended); + if (src.empty()) { + CommitTree(root, rep, scope, method); return; } if (cord_ring_enabled()) { - absl::string_view data(src_data, src_size); - root = ForceRing(root, (data.size() - 1) / kMaxFlatLength + 1); - replace_tree(CordRepRing::Append(root->ring(), data)); - return; - } - - // Use new block(s) for any remaining bytes that were not handled above. - // Alloc extra memory only if the right child of the root of the new tree is - // going to be a FLAT node, which will permit further inplace appends. - size_t length = src_size; - if (src_size < kMaxFlatLength) { - // The new length is either - // - old size + 10% - // - old_size + src_size - // This will cause a reasonable conservative step-up in size that is still - // large enough to avoid excessive amounts of small fragments being added. 
- length = std::max<size_t>(root->length / 10, src_size); + rep = ForceRing(rep, (src.size() - 1) / kMaxFlatLength + 1); + rep = CordRepRing::Append(rep->ring(), src); + } else { + // Use new block(s) for any remaining bytes that were not handled above. + // Alloc extra memory only if the right child of the root of the new tree + // is going to be a FLAT node, which will permit further inplace appends. + size_t length = src.size(); + if (src.size() < kMaxFlatLength) { + // The new length is either + // - old size + 10% + // - old_size + src.size() + // This will cause a reasonable conservative step-up in size that is + // still large enough to avoid excessive amounts of small fragments + // being added. + length = std::max<size_t>(rep->length / 10, src.size()); + } + rep = Concat(rep, NewTree(src.data(), src.size(), length - src.size())); } - set_tree(Concat(root, NewTree(src_data, src_size, length - src_size))); + CommitTree(root, rep, scope, method); } inline CordRep* Cord::TakeRep() const& { @@ -685,10 +724,17 @@ inline CordRep* Cord::TakeRep() && { template <typename C> inline void Cord::AppendImpl(C&& src) { + auto constexpr method = CordzUpdateTracker::kAppendCord; if (empty()) { - // In case of an empty destination avoid allocating a new node, do not copy - // data. - *this = std::forward<C>(src); + // Since destination is empty, we can avoid allocating a node, + if (src.contents_.is_tree()) { + // by taking the tree directly + CordRep* rep = std::forward<C>(src).TakeRep(); + contents_.EmplaceTree(rep, method); + } else { + // or copying over inline data + contents_.data_ = src.contents_.data_; + } return; } @@ -698,12 +744,12 @@ inline void Cord::AppendImpl(C&& src) { CordRep* src_tree = src.contents_.tree(); if (src_tree == nullptr) { // src has embedded data. - contents_.AppendArray(src.contents_.data(), src_size); + contents_.AppendArray({src.contents_.data(), src_size}, method); return; } if (src_tree->tag >= FLAT) { // src tree just has one flat node. 
- contents_.AppendArray(src_tree->flat()->Data(), src_size); + contents_.AppendArray({src_tree->flat()->Data(), src_size}, method); return; } if (&src == this) { @@ -719,7 +765,8 @@ inline void Cord::AppendImpl(C&& src) { } // Guaranteed to be a tree (kMaxBytesToCopy > kInlinedSize) - contents_.AppendTree(std::forward<C>(src).TakeRep()); + CordRep* rep = std::forward<C>(src).TakeRep(); + contents_.AppendTree(rep, CordzUpdateTracker::kAppendCord); } void Cord::Append(const Cord& src) { AppendImpl(src); } @@ -731,7 +778,8 @@ void Cord::Append(T&& src) { if (src.size() <= kMaxBytesToCopy) { Append(absl::string_view(src)); } else { - Append(Cord(std::forward<T>(src))); + CordRep* rep = CordRepFromString(std::forward<T>(src)); + contents_.AppendTree(rep, CordzUpdateTracker::kAppendString); } } @@ -741,7 +789,7 @@ void Cord::Prepend(const Cord& src) { CordRep* src_tree = src.contents_.tree(); if (src_tree != nullptr) { CordRep::Ref(src_tree); - contents_.PrependTree(src_tree); + contents_.PrependTree(src_tree, CordzUpdateTracker::kPrependCord); return; } @@ -764,7 +812,8 @@ void Cord::Prepend(absl::string_view src) { return; } } - contents_.PrependTree(NewTree(src.data(), src.size(), 0)); + CordRep* rep = NewTree(src.data(), src.size(), 0); + contents_.PrependTree(rep, CordzUpdateTracker::kPrependString); } template <typename T, Cord::EnableIfString<T>> @@ -772,7 +821,8 @@ inline void Cord::Prepend(T&& src) { if (src.size() <= kMaxBytesToCopy) { Prepend(absl::string_view(src)); } else { - Prepend(Cord(std::forward<T>(src))); + CordRep* rep = CordRepFromString(std::forward<T>(src)); + contents_.PrependTree(rep, CordzUpdateTracker::kPrependString); } } @@ -870,12 +920,17 @@ void Cord::RemovePrefix(size_t n) { CordRep* tree = contents_.tree(); if (tree == nullptr) { contents_.remove_prefix(n); - } else if (tree->tag == RING) { - contents_.replace_tree(CordRepRing::RemovePrefix(tree->ring(), n)); } else { - CordRep* newrep = RemovePrefixFrom(tree, n); - CordRep::Unref(tree); 
- contents_.replace_tree(VerifyTree(newrep)); + auto constexpr method = CordzUpdateTracker::kRemovePrefix; + CordzUpdateScope scope(contents_.cordz_info(), method); + if (tree->tag == RING) { + tree = CordRepRing::RemovePrefix(tree->ring(), n); + } else { + CordRep* newrep = RemovePrefixFrom(tree, n); + CordRep::Unref(tree); + tree = VerifyTree(newrep); + } + contents_.SetTreeOrEmpty(tree, scope); } } @@ -886,12 +941,17 @@ void Cord::RemoveSuffix(size_t n) { CordRep* tree = contents_.tree(); if (tree == nullptr) { contents_.reduce_size(n); - } else if (tree->tag == RING) { - contents_.replace_tree(CordRepRing::RemoveSuffix(tree->ring(), n)); } else { - CordRep* newrep = RemoveSuffixFrom(tree, n); - CordRep::Unref(tree); - contents_.replace_tree(VerifyTree(newrep)); + auto constexpr method = CordzUpdateTracker::kRemoveSuffix; + CordzUpdateScope scope(contents_.cordz_info(), method); + if (tree->tag == RING) { + tree = CordRepRing::RemoveSuffix(tree->ring(), n); + } else { + CordRep* newrep = RemoveSuffixFrom(tree, n); + CordRep::Unref(tree); + tree = VerifyTree(newrep); + } + contents_.SetTreeOrEmpty(tree, scope); } } @@ -951,17 +1011,20 @@ Cord Cord::Subcord(size_t pos, size_t new_size) const { size_t length = size(); if (pos > length) pos = length; if (new_size > length - pos) new_size = length - pos; + if (new_size == 0) return sub_cord; + CordRep* tree = contents_.tree(); if (tree == nullptr) { // sub_cord is newly constructed, no need to re-zero-out the tail of // contents_ memory. sub_cord.contents_.set_data(contents_.data() + pos, new_size, false); - } else if (new_size == 0) { - // We want to return empty subcord, so nothing to do. 
- } else if (new_size <= InlineRep::kMaxInline) { + return sub_cord; + } + + if (new_size <= InlineRep::kMaxInline) { + char* dest = sub_cord.contents_.data_.as_chars(); Cord::ChunkIterator it = chunk_begin(); it.AdvanceBytes(pos); - char* dest = sub_cord.contents_.data_.as_chars(); size_t remaining_size = new_size; while (remaining_size > it->size()) { cord_internal::SmallMemmove(dest, it->data(), it->size()); @@ -971,12 +1034,17 @@ Cord Cord::Subcord(size_t pos, size_t new_size) const { } cord_internal::SmallMemmove(dest, it->data(), remaining_size); sub_cord.contents_.set_inline_size(new_size); - } else if (tree->tag == RING) { - tree = CordRepRing::SubRing(CordRep::Ref(tree)->ring(), pos, new_size); - sub_cord.contents_.set_tree(tree); + return sub_cord; + } + + if (tree->tag == RING) { + CordRepRing* ring = CordRep::Ref(tree)->ring(); + tree = CordRepRing::SubRing(ring, pos, new_size); } else { - sub_cord.contents_.set_tree(NewSubRange(tree, pos, new_size)); + tree = NewSubRange(tree, pos, new_size); } + sub_cord.contents_.EmplaceTree(tree, contents_.data_, + CordzUpdateTracker::kSubCord); return sub_cord; } @@ -1418,6 +1486,7 @@ Cord Cord::ChunkIterator::AdvanceAndReadBytes(size_t n) { ABSL_HARDENING_ASSERT(bytes_remaining_ >= n && "Attempted to iterate past `end()`"); Cord subcord; + auto constexpr method = CordzUpdateTracker::kCordReader; if (n <= InlineRep::kMaxInline) { // Range to read fits in inline data. Flatten it. 
@@ -1440,11 +1509,12 @@ Cord Cord::ChunkIterator::AdvanceAndReadBytes(size_t n) { if (ring_reader_) { size_t chunk_size = current_chunk_.size(); if (n <= chunk_size && n <= kMaxBytesToCopy) { - subcord = Cord(current_chunk_.substr(0, n)); + subcord = Cord(current_chunk_.substr(0, n), method); } else { auto* ring = CordRep::Ref(ring_reader_.ring())->ring(); size_t offset = ring_reader_.length() - bytes_remaining_; - subcord.contents_.set_tree(CordRepRing::SubRing(ring, offset, n)); + CordRep* rep = CordRepRing::SubRing(ring, offset, n); + subcord.contents_.EmplaceTree(rep, method); } if (n < chunk_size) { bytes_remaining_ -= n; @@ -1463,7 +1533,7 @@ Cord Cord::ChunkIterator::AdvanceAndReadBytes(size_t n) { const char* data = subnode->tag == EXTERNAL ? subnode->external()->base : subnode->flat()->Data(); subnode = NewSubstring(subnode, current_chunk_.data() - data, n); - subcord.contents_.set_tree(VerifyTree(subnode)); + subcord.contents_.EmplaceTree(VerifyTree(subnode), method); RemoveChunkPrefix(n); return subcord; } @@ -1506,7 +1576,7 @@ Cord Cord::ChunkIterator::AdvanceAndReadBytes(size_t n) { if (node == nullptr) { // We have reached the end of the Cord. 
assert(bytes_remaining_ == 0); - subcord.contents_.set_tree(VerifyTree(subnode)); + subcord.contents_.EmplaceTree(VerifyTree(subnode), method); return subcord; } @@ -1546,7 +1616,7 @@ Cord Cord::ChunkIterator::AdvanceAndReadBytes(size_t n) { current_chunk_ = absl::string_view(data + offset + n, length - n); current_leaf_ = node; bytes_remaining_ -= n; - subcord.contents_.set_tree(VerifyTree(subnode)); + subcord.contents_.EmplaceTree(VerifyTree(subnode), method); return subcord; } @@ -1653,6 +1723,7 @@ char Cord::operator[](size_t i) const { } absl::string_view Cord::FlattenSlowPath() { + assert(contents_.is_tree()); size_t total_size = size(); CordRep* new_rep; char* new_buffer; @@ -1673,10 +1744,9 @@ absl::string_view Cord::FlattenSlowPath() { s.size()); }); } - if (CordRep* tree = contents_.tree()) { - CordRep::Unref(tree); - } - contents_.set_tree(new_rep); + CordzUpdateScope scope(contents_.cordz_info(), CordzUpdateTracker::kFlatten); + CordRep::Unref(contents_.as_tree()); + contents_.SetTree(new_rep, scope); return absl::string_view(new_buffer, total_size); } @@ -1688,6 +1758,8 @@ absl::string_view Cord::FlattenSlowPath() { } else if (rep->tag == EXTERNAL) { *fragment = absl::string_view(rep->external()->base, rep->length); return true; + } else if (rep->tag == RING) { + return rep->ring()->IsFlat(fragment); } else if (rep->tag == SUBSTRING) { CordRep* child = rep->substring()->child; if (child->tag >= FLAT) { @@ -1698,6 +1770,9 @@ absl::string_view Cord::FlattenSlowPath() { *fragment = absl::string_view( child->external()->base + rep->substring()->start, rep->length); return true; + } else if (child->tag == RING) { + return child->ring()->IsFlat(rep->substring()->start, rep->length, + fragment); } } return false; @@ -1786,8 +1861,7 @@ static void DumpNode(CordRep* rep, bool include_data, std::ostream* os, *os << absl::CEscape(std::string(rep->external()->base, rep->length)); *os << "]\n"; } else if (rep->tag >= FLAT) { - *os << "FLAT cap=" << 
rep->flat()->Capacity() - << " ["; + *os << "FLAT cap=" << rep->flat()->Capacity() << " ["; if (include_data) *os << absl::CEscape(std::string(rep->flat()->Data(), rep->length)); *os << "]\n"; @@ -1799,7 +1873,7 @@ static void DumpNode(CordRep* rep, bool include_data, std::ostream* os, do { DumpNode(ring->entry_child(head), include_data, os, indent + kIndentStep); - head = ring->advance(head);; + head = ring->advance(head); } while (head != ring->tail()); } if (stack.empty()) break; @@ -1845,9 +1919,8 @@ static bool VerifyNode(CordRep* root, CordRep* start_node, worklist.push_back(node->concat()->left); } } else if (node->tag >= FLAT) { - ABSL_INTERNAL_CHECK( - node->length <= node->flat()->Capacity(), - ReportError(root, node)); + ABSL_INTERNAL_CHECK(node->length <= node->flat()->Capacity(), + ReportError(root, node)); } else if (node->tag == EXTERNAL) { ABSL_INTERNAL_CHECK(node->external()->base != nullptr, ReportError(root, node)); diff --git a/third_party/abseil-cpp/absl/strings/cord.h b/third_party/abseil-cpp/absl/strings/cord.h index fa9cb913fd..e758f1cdfb 100644 --- a/third_party/abseil-cpp/absl/strings/cord.h +++ b/third_party/abseil-cpp/absl/strings/cord.h @@ -70,6 +70,7 @@ #include <string> #include <type_traits> +#include "absl/base/config.h" #include "absl/base/internal/endian.h" #include "absl/base/internal/per_thread_tls.h" #include "absl/base/macros.h" @@ -80,6 +81,11 @@ #include "absl/strings/internal/cord_internal.h" #include "absl/strings/internal/cord_rep_ring.h" #include "absl/strings/internal/cord_rep_ring_reader.h" +#include "absl/strings/internal/cordz_functions.h" +#include "absl/strings/internal/cordz_info.h" +#include "absl/strings/internal/cordz_statistics.h" +#include "absl/strings/internal/cordz_update_scope.h" +#include "absl/strings/internal/cordz_update_tracker.h" #include "absl/strings/internal/resize_uninitialized.h" #include "absl/strings/internal/string_constant.h" #include "absl/strings/string_view.h" @@ -664,10 +670,24 @@ class 
Cord { explicit constexpr Cord(strings_internal::StringConstant<T>); private: + using CordRep = absl::cord_internal::CordRep; + using CordRepFlat = absl::cord_internal::CordRepFlat; + using CordzInfo = cord_internal::CordzInfo; + using CordzUpdateScope = cord_internal::CordzUpdateScope; + using CordzUpdateTracker = cord_internal::CordzUpdateTracker; + using InlineData = cord_internal::InlineData; + using MethodIdentifier = CordzUpdateTracker::MethodIdentifier; + + // Creates a cord instance with `method` representing the originating + // public API call causing the cord to be created. + explicit Cord(absl::string_view src, MethodIdentifier method); + friend class CordTestPeer; friend bool operator==(const Cord& lhs, const Cord& rhs); friend bool operator==(const Cord& lhs, absl::string_view rhs); + friend const CordzInfo* GetCordzInfoForTesting(const Cord& cord); + // Calls the provided function once for each cord chunk, in order. Unlike // Chunks(), this API will not allocate memory. void ForEachChunk(absl::FunctionRef<void(absl::string_view)>) const; @@ -687,6 +707,7 @@ class Cord { static_assert(kMaxInline >= sizeof(absl::cord_internal::CordRep*), ""); constexpr InlineRep() : data_() {} + explicit InlineRep(InlineData::DefaultInitType init) : data_(init) {} InlineRep(const InlineRep& src); InlineRep(InlineRep&& src); InlineRep& operator=(const InlineRep& src); @@ -704,23 +725,56 @@ class Cord { // Returns nullptr if holding bytes absl::cord_internal::CordRep* tree() const; absl::cord_internal::CordRep* as_tree() const; - // Discards old pointer, if any - void set_tree(absl::cord_internal::CordRep* rep); - // Replaces a tree with a new root. This is faster than set_tree, but it - // should only be used when it's clear that the old rep was a tree. - void replace_tree(absl::cord_internal::CordRep* rep); // Returns non-null iff was holding a pointer absl::cord_internal::CordRep* clear(); // Converts to pointer if necessary. 
- absl::cord_internal::CordRep* force_tree(size_t extra_hint); void reduce_size(size_t n); // REQUIRES: holding data void remove_prefix(size_t n); // REQUIRES: holding data - void AppendArray(const char* src_data, size_t src_size); + void AppendArray(absl::string_view src, MethodIdentifier method); absl::string_view FindFlatStartPiece() const; - void AppendTree(absl::cord_internal::CordRep* tree); - void PrependTree(absl::cord_internal::CordRep* tree); - void GetAppendRegion(char** region, size_t* size, size_t max_length); - void GetAppendRegion(char** region, size_t* size); + + // Creates a CordRepFlat instance from the current inlined data with `extra' + // bytes of desired additional capacity. + CordRepFlat* MakeFlatWithExtraCapacity(size_t extra); + + // Sets the tree value for this instance. `rep` must not be null. + // Requires the current instance to hold a tree, and a lock to be held on + // any CordzInfo referenced by this instance. The latter is enforced through + // the CordzUpdateScope argument. If the current instance is sampled, then + // the CordzInfo instance is updated to reference the new `rep` value. + void SetTree(CordRep* rep, const CordzUpdateScope& scope); + + // Identical to SetTree(), except that `rep` is allowed to be null, in + // which case the current instance is reset to an empty value. + void SetTreeOrEmpty(CordRep* rep, const CordzUpdateScope& scope); + + // Sets the tree value for this instance, and randomly samples this cord. + // This function disregards existing contents in `data_`, and should be + // called when a Cord is 'promoted' from an 'uninitialized' or 'inlined' + // value to a non-inlined (tree / ring) value. + void EmplaceTree(CordRep* rep, MethodIdentifier method); + + // Identical to EmplaceTree, except that it copies the parent stack from + // the provided `parent` data if the parent is sampled. 
+ void EmplaceTree(CordRep* rep, const InlineData& parent, + MethodIdentifier method); + + // Commits the change of a newly created, or updated `rep` root value into + // this cord. `old_rep` indicates the old (inlined or tree) value of the + // cord, and determines if the commit invokes SetTree() or EmplaceTree(). + void CommitTree(const CordRep* old_rep, CordRep* rep, + const CordzUpdateScope& scope, MethodIdentifier method); + + void AppendTreeToInlined(CordRep* tree, MethodIdentifier method); + void AppendTreeToTree(CordRep* tree, MethodIdentifier method); + void AppendTree(CordRep* tree, MethodIdentifier method); + void PrependTreeToInlined(CordRep* tree, MethodIdentifier method); + void PrependTreeToTree(CordRep* tree, MethodIdentifier method); + void PrependTree(CordRep* tree, MethodIdentifier method); + + template <bool has_length> + void GetAppendRegion(char** region, size_t* size, size_t length); + bool IsSame(const InlineRep& other) const { return memcmp(&data_, &other.data_, sizeof(data_)) == 0; } @@ -776,8 +830,8 @@ class Cord { friend class Cord; void AssignSlow(const InlineRep& src); - // Unrefs the tree, stops profiling, and zeroes the contents - void ClearSlow(); + // Unrefs the tree and stops profiling. + void UnrefTree(); void ResetToEmpty() { data_ = {}; } @@ -828,6 +882,10 @@ class Cord { template <typename C> void AppendImpl(C&& src); + // Assigns the value in 'src' to this instance, 'stealing' its contents. + // Requires src.length() > kMaxBytesToCopy. + Cord& AssignLargeString(std::string&& src); + // Helper for AbslHashValue(). 
template <typename H> H HashFragmented(H hash_state) const { @@ -930,8 +988,11 @@ inline CordRep* NewExternalRep(absl::string_view data, template <typename Releaser> Cord MakeCordFromExternal(absl::string_view data, Releaser&& releaser) { Cord cord; - cord.contents_.set_tree(::absl::cord_internal::NewExternalRep( - data, std::forward<Releaser>(releaser))); + if (auto* rep = ::absl::cord_internal::NewExternalRep( + data, std::forward<Releaser>(releaser))) { + cord.contents_.EmplaceTree(rep, + Cord::MethodIdentifier::kMakeCordFromExternal); + } return cord; } @@ -939,15 +1000,16 @@ constexpr Cord::InlineRep::InlineRep(cord_internal::InlineData data) : data_(data) {} inline Cord::InlineRep::InlineRep(const Cord::InlineRep& src) - : data_(src.data_) { - if (is_tree()) { - data_.clear_cordz_info(); - absl::cord_internal::CordRep::Ref(as_tree()); + : data_(InlineData::kDefaultInit) { + if (CordRep* tree = src.tree()) { + EmplaceTree(CordRep::Ref(tree), src.data_, + CordzUpdateTracker::kConstructorCord); + } else { + data_ = src.data_; } } -inline Cord::InlineRep::InlineRep(Cord::InlineRep&& src) { - data_ = src.data_; +inline Cord::InlineRep::InlineRep(Cord::InlineRep&& src) : data_(src.data_) { src.ResetToEmpty(); } @@ -966,7 +1028,7 @@ inline Cord::InlineRep& Cord::InlineRep::operator=(const Cord::InlineRep& src) { inline Cord::InlineRep& Cord::InlineRep::operator=( Cord::InlineRep&& src) noexcept { if (is_tree()) { - ClearSlow(); + UnrefTree(); } data_ = src.data_; src.ResetToEmpty(); @@ -1003,31 +1065,62 @@ inline size_t Cord::InlineRep::size() const { return is_tree() ? 
as_tree()->length : inline_size(); } -inline void Cord::InlineRep::set_tree(absl::cord_internal::CordRep* rep) { - if (rep == nullptr) { - ResetToEmpty(); +inline cord_internal::CordRepFlat* Cord::InlineRep::MakeFlatWithExtraCapacity( + size_t extra) { + static_assert(cord_internal::kMinFlatLength >= sizeof(data_), ""); + size_t len = data_.inline_size(); + auto* result = CordRepFlat::New(len + extra); + result->length = len; + memcpy(result->Data(), data_.as_chars(), sizeof(data_)); + return result; +} + +inline void Cord::InlineRep::EmplaceTree(CordRep* rep, + MethodIdentifier method) { + assert(rep); + data_.make_tree(rep); + CordzInfo::MaybeTrackCord(data_, method); +} + +inline void Cord::InlineRep::EmplaceTree(CordRep* rep, const InlineData& parent, + MethodIdentifier method) { + data_.make_tree(rep); + CordzInfo::MaybeTrackCord(data_, parent, method); +} + +inline void Cord::InlineRep::SetTree(CordRep* rep, + const CordzUpdateScope& scope) { + assert(rep); + assert(data_.is_tree()); + data_.set_tree(rep); + scope.SetCordRep(rep); +} + +inline void Cord::InlineRep::SetTreeOrEmpty(CordRep* rep, + const CordzUpdateScope& scope) { + assert(data_.is_tree()); + if (rep) { + data_.set_tree(rep); } else { - if (data_.is_tree()) { - // `data_` already holds a 'tree' value and an optional cordz_info value. - // Replace the tree value only, leaving the cordz_info value unchanged. - data_.set_tree(rep); - } else { - // `data_` contains inlined data: initialize data_ to tree value `rep`. 
- data_.make_tree(rep); - } + data_ = {}; } + scope.SetCordRep(rep); } -inline void Cord::InlineRep::replace_tree(absl::cord_internal::CordRep* rep) { - ABSL_ASSERT(is_tree()); - if (ABSL_PREDICT_FALSE(rep == nullptr)) { - set_tree(rep); - return; +inline void Cord::InlineRep::CommitTree(const CordRep* old_rep, CordRep* rep, + const CordzUpdateScope& scope, + MethodIdentifier method) { + if (old_rep) { + SetTree(rep, scope); + } else { + EmplaceTree(rep, method); } - data_.set_tree(rep); } inline absl::cord_internal::CordRep* Cord::InlineRep::clear() { + if (is_tree()) { + CordzInfo::MaybeUntrackCord(cordz_info()); + } absl::cord_internal::CordRep* result = tree(); ResetToEmpty(); return result; @@ -1042,6 +1135,9 @@ inline void Cord::InlineRep::CopyToArray(char* dst) const { constexpr inline Cord::Cord() noexcept {} +inline Cord::Cord(absl::string_view src) + : Cord(src, CordzUpdateTracker::kConstructorString) {} + template <typename T> constexpr Cord::Cord(strings_internal::StringConstant<T>) : contents_(strings_internal::StringConstant<T>::value.size() <= @@ -1057,6 +1153,15 @@ inline Cord& Cord::operator=(const Cord& x) { return *this; } +template <typename T, Cord::EnableIfString<T>> +Cord& Cord::operator=(T&& src) { + if (src.size() <= cord_internal::kMaxBytesToCopy) { + return operator=(absl::string_view(src)); + } else { + return AssignLargeString(std::forward<T>(src)); + } +} + inline Cord::Cord(const Cord& src) : contents_(src.contents_) {} inline Cord::Cord(Cord&& src) noexcept : contents_(std::move(src.contents_)) {} @@ -1071,7 +1176,6 @@ inline Cord& Cord::operator=(Cord&& x) noexcept { } extern template Cord::Cord(std::string&& src); -extern template Cord& Cord::operator=(std::string&& src); inline size_t Cord::size() const { // Length is 1st field in str.rep_ @@ -1114,7 +1218,7 @@ inline absl::string_view Cord::Flatten() { } inline void Cord::Append(absl::string_view src) { - contents_.AppendArray(src.data(), src.size()); + contents_.AppendArray(src, 
CordzUpdateTracker::kAppendString); } extern template void Cord::Append(std::string&& src); diff --git a/third_party/abseil-cpp/absl/strings/cord_ring_reader_test.cc b/third_party/abseil-cpp/absl/strings/cord_ring_reader_test.cc index 585616f3c0..d9a9a76d1e 100644 --- a/third_party/abseil-cpp/absl/strings/cord_ring_reader_test.cc +++ b/third_party/abseil-cpp/absl/strings/cord_ring_reader_test.cc @@ -78,6 +78,7 @@ TEST(CordRingReaderTest, Reset) { EXPECT_TRUE(static_cast<bool>(reader)); EXPECT_THAT(reader.ring(), Eq(ring)); EXPECT_THAT(reader.index(), Eq(ring->head())); + EXPECT_THAT(reader.node(), Eq(ring->entry_child(ring->head()))); EXPECT_THAT(reader.length(), Eq(ring->length)); EXPECT_THAT(reader.consumed(), Eq(flats[0].length())); EXPECT_THAT(reader.remaining(), Eq(ring->length - reader.consumed())); @@ -99,11 +100,13 @@ TEST(CordRingReaderTest, Next) { size_t consumed = reader.consumed(); size_t remaining = reader.remaining(); for (int i = 1; i < flats.size(); ++i) { + CordRepRing::index_type index = ring->advance(head, i); consumed += flats[i].length(); remaining -= flats[i].length(); absl::string_view next = reader.Next(); ASSERT_THAT(next, Eq(flats[i])); - ASSERT_THAT(reader.index(), Eq(ring->advance(head, i))); + ASSERT_THAT(reader.index(), Eq(index)); + ASSERT_THAT(reader.node(), Eq(ring->entry_child(index))); ASSERT_THAT(reader.consumed(), Eq(consumed)); ASSERT_THAT(reader.remaining(), Eq(remaining)); } @@ -125,13 +128,15 @@ TEST(CordRingReaderTest, SeekForward) { size_t consumed = 0; size_t remaining = ring->length;; for (int i = 0; i < flats.size(); ++i) { + CordRepRing::index_type index = ring->advance(head, i); size_t offset = consumed; consumed += flats[i].length(); remaining -= flats[i].length(); for (int off = 0; off < flats[i].length(); ++off) { absl::string_view chunk = reader.Seek(offset + off); ASSERT_THAT(chunk, Eq(flats[i].substr(off))); - ASSERT_THAT(reader.index(), Eq(ring->advance(head, i))); + ASSERT_THAT(reader.index(), Eq(index)); + 
ASSERT_THAT(reader.node(), Eq(ring->entry_child(index))); ASSERT_THAT(reader.consumed(), Eq(consumed)); ASSERT_THAT(reader.remaining(), Eq(remaining)); } @@ -150,11 +155,13 @@ TEST(CordRingReaderTest, SeekBackward) { size_t consumed = ring->length; size_t remaining = 0; for (int i = flats.size() - 1; i >= 0; --i) { + CordRepRing::index_type index = ring->advance(head, i); size_t offset = consumed - flats[i].length(); for (int off = 0; off < flats[i].length(); ++off) { absl::string_view chunk = reader.Seek(offset + off); ASSERT_THAT(chunk, Eq(flats[i].substr(off))); - ASSERT_THAT(reader.index(), Eq(ring->advance(head, i))); + ASSERT_THAT(reader.index(), Eq(index)); + ASSERT_THAT(reader.node(), Eq(ring->entry_child(index))); ASSERT_THAT(reader.consumed(), Eq(consumed)); ASSERT_THAT(reader.remaining(), Eq(remaining)); } diff --git a/third_party/abseil-cpp/absl/strings/cord_ring_test.cc b/third_party/abseil-cpp/absl/strings/cord_ring_test.cc index 7d75e106e7..cc8fbaf995 100644 --- a/third_party/abseil-cpp/absl/strings/cord_ring_test.cc +++ b/third_party/abseil-cpp/absl/strings/cord_ring_test.cc @@ -31,9 +31,6 @@ extern thread_local bool cord_ring; -// TOOD(b/177688959): weird things happened with the original test -#define ASAN_BUG_177688959_FIXED false - namespace absl { ABSL_NAMESPACE_BEGIN namespace { @@ -101,15 +98,22 @@ using TestParams = std::vector<TestParam>; // Matcher validating when mutable copies are required / performed. MATCHER_P2(EqIfPrivate, param, rep, absl::StrCat("Equal 0x", absl::Hex(rep), " if private")) { - return param.refcount_is_one ? arg == rep : arg != rep; + return param.refcount_is_one ? arg == rep : true; } // Matcher validating when mutable copies are required / performed. MATCHER_P2(EqIfPrivateAndCapacity, param, rep, absl::StrCat("Equal 0x", absl::Hex(rep), " if private and capacity")) { - return (param.refcount_is_one && param.with_capacity) ? arg == rep - : arg != rep; + return (param.refcount_is_one && param.with_capacity) ? 
arg == rep : true; +} + +// Matcher validating a shared ring was re-allocated. Should only be used for +// tests doing exactly one update as subsequent updates could return the +// original (freed and re-used) pointer. +MATCHER_P2(NeIfShared, param, rep, + absl::StrCat("Not equal 0x", absl::Hex(rep), " if shared")) { + return param.refcount_is_one ? true : arg != rep; } MATCHER_P2(EqIfInputPrivate, param, rep, "Equal if input is private") { @@ -340,19 +344,15 @@ std::string TestParamToString(const testing::TestParamInfo<TestParam>& info) { class CordRingTest : public testing::Test { public: ~CordRingTest() override { -#if ASAN_BUG_177688959_FIXED for (CordRep* rep : unrefs_) { CordRep::Unref(rep); } -#endif } template <typename CordRepType> CordRepType* NeedsUnref(CordRepType* rep) { assert(rep); -#if ASAN_BUG_177688959_FIXED unrefs_.push_back(rep); -#endif return rep; } @@ -362,26 +362,16 @@ class CordRingTest : public testing::Test { return NeedsUnref(rep); } - void Unref(CordRep* rep) { -#if !ASAN_BUG_177688959_FIXED - CordRep::Unref(rep); -#endif - } - private: -#if ASAN_BUG_177688959_FIXED std::vector<CordRep*> unrefs_; -#endif }; class CordRingTestWithParam : public testing::TestWithParam<TestParam> { public: ~CordRingTestWithParam() override { -#if ASAN_BUG_177688959_FIXED for (CordRep* rep : unrefs_) { CordRep::Unref(rep); } -#endif } CordRepRing* CreateWithCapacity(CordRep* child, size_t extra_capacity) { @@ -400,9 +390,7 @@ class CordRingTestWithParam : public testing::TestWithParam<TestParam> { template <typename CordRepType> CordRepType* NeedsUnref(CordRepType* rep) { assert(rep); -#if ASAN_BUG_177688959_FIXED unrefs_.push_back(rep); -#endif return rep; } @@ -412,43 +400,23 @@ class CordRingTestWithParam : public testing::TestWithParam<TestParam> { return NeedsUnref(rep); } - void Unref(CordRep* rep) { -#if !ASAN_BUG_177688959_FIXED - CordRep::Unref(rep); -#endif - } - template <typename CordRepType> CordRepType* RefIfShared(CordRepType* rep) { return 
Shared() ? Ref(rep) : rep; } - void UnrefIfShared(CordRep* rep) { - if (Shared()) Unref(rep); - } - template <typename CordRepType> CordRepType* RefIfInputShared(CordRepType* rep) { return InputShared() ? Ref(rep) : rep; } - void UnrefIfInputShared(CordRep* rep) { - if (InputShared()) Unref(rep); - } - template <typename CordRepType> CordRepType* RefIfInputSharedIndirect(CordRepType* rep) { return InputSharedIndirect() ? Ref(rep) : rep; } - void UnrefIfInputSharedIndirect(CordRep* rep) { - if (InputSharedIndirect()) Unref(rep); - } - private: -#if ASAN_BUG_177688959_FIXED std::vector<CordRep*> unrefs_; -#endif }; class CordRingCreateTest : public CordRingTestWithParam { @@ -520,26 +488,26 @@ class CordRingBuildInputTest : public CordRingTestWithParam { } }; -INSTANTIATE_TEST_CASE_P(WithParam, CordRingSubTest, - testing::ValuesIn(CordRingSubTest::CreateTestParams()), - TestParamToString); +INSTANTIATE_TEST_SUITE_P(WithParam, CordRingSubTest, + testing::ValuesIn(CordRingSubTest::CreateTestParams()), + TestParamToString); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( WithParam, CordRingCreateTest, testing::ValuesIn(CordRingCreateTest::CreateTestParams()), TestParamToString); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( WithParam, CordRingCreateFromTreeTest, testing::ValuesIn(CordRingCreateFromTreeTest::CreateTestParams()), TestParamToString); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( WithParam, CordRingBuildTest, testing::ValuesIn(CordRingBuildTest::CreateTestParams()), TestParamToString); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( WithParam, CordRingBuildInputTest, testing::ValuesIn(CordRingBuildInputTest::CreateTestParams()), TestParamToString); @@ -550,7 +518,6 @@ TEST_P(CordRingCreateTest, CreateFromFlat) { ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result->length, Eq(str1.size())); EXPECT_THAT(ToFlats(result), ElementsAre(str1)); - Unref(result); } TEST_P(CordRingCreateTest, CreateFromRing) { @@ -558,9 +525,8 @@ 
TEST_P(CordRingCreateTest, CreateFromRing) { CordRepRing* result = NeedsUnref(CordRepRing::Create(ring)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivate(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAreArray(kFoxFlats)); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingCreateFromTreeTest, CreateFromSubstringRing) { @@ -570,23 +536,20 @@ TEST_P(CordRingCreateFromTreeTest, CreateFromSubstringRing) { ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfInputPrivate(GetParam(), ring)); EXPECT_THAT(ToString(result), string_view(kFox).substr(2, 11)); - UnrefIfInputSharedIndirect(ring); - UnrefIfInputShared(sub); - Unref(result); } TEST_F(CordRingTest, CreateWithIllegalExtraCapacity) { - CordRep* flat = NeedsUnref(MakeFlat("Hello world")); #if defined(ABSL_HAVE_EXCEPTIONS) + CordRep* flat = NeedsUnref(MakeFlat("Hello world")); try { CordRepRing::Create(flat, CordRepRing::kMaxCapacity); GTEST_FAIL() << "expected std::length_error exception"; } catch (const std::length_error&) { } #elif defined(GTEST_HAS_DEATH_TEST) + CordRep* flat = NeedsUnref(MakeFlat("Hello world")); EXPECT_DEATH(CordRepRing::Create(flat, CordRepRing::kMaxCapacity), ".*"); #endif - Unref(flat); } TEST_P(CordRingCreateFromTreeTest, CreateFromSubstringOfFlat) { @@ -597,9 +560,6 @@ TEST_P(CordRingCreateFromTreeTest, CreateFromSubstringOfFlat) { ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result->length, Eq(20)); EXPECT_THAT(ToFlats(result), ElementsAre(str1.substr(4, 20))); - Unref(result); - UnrefIfInputShared(flat); - UnrefIfInputSharedIndirect(child); } TEST_P(CordRingCreateTest, CreateFromExternal) { @@ -609,8 +569,6 @@ TEST_P(CordRingCreateTest, CreateFromExternal) { ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result->length, Eq(str1.size())); EXPECT_THAT(ToFlats(result), ElementsAre(str1)); - Unref(result); - UnrefIfInputShared(child); } TEST_P(CordRingCreateFromTreeTest, 
CreateFromSubstringOfExternal) { @@ -621,9 +579,6 @@ TEST_P(CordRingCreateFromTreeTest, CreateFromSubstringOfExternal) { ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result->length, Eq(24)); EXPECT_THAT(ToFlats(result), ElementsAre(str1.substr(1, 24))); - Unref(result); - UnrefIfInputShared(external); - UnrefIfInputSharedIndirect(child); } TEST_P(CordRingCreateFromTreeTest, CreateFromSubstringOfLargeExternal) { @@ -637,9 +592,6 @@ TEST_P(CordRingCreateFromTreeTest, CreateFromSubstringOfLargeExternal) { ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result->length, Eq(str.size())); EXPECT_THAT(ToRawFlats(result), ElementsAre(str)); - Unref(result); - UnrefIfInputShared(external); - UnrefIfInputSharedIndirect(child); } TEST_P(CordRingBuildInputTest, CreateFromConcat) { @@ -652,10 +604,6 @@ TEST_P(CordRingBuildInputTest, CreateFromConcat) { ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result->length, Eq(26)); EXPECT_THAT(ToString(result), Eq(kAlphabet)); - UnrefIfInputSharedIndirect(flats[0]); - UnrefIfInputSharedIndirect(flats[3]); - UnrefIfInputShared(concat); - Unref(result); } TEST_P(CordRingBuildInputTest, CreateFromSubstringConcat) { @@ -671,10 +619,6 @@ TEST_P(CordRingBuildInputTest, CreateFromSubstringConcat) { ASSERT_THAT(result, IsValidRingBuffer()); ASSERT_THAT(result->length, Eq(len)); ASSERT_THAT(ToString(result), string_view(kAlphabet).substr(off, len)); - UnrefIfInputSharedIndirect(flats[0]); - UnrefIfInputSharedIndirect(flats[3]); - UnrefIfInputShared(child); - Unref(result); } } } @@ -689,7 +633,6 @@ TEST_P(CordRingCreateTest, Properties) { EXPECT_THAT(result->capacity(), Le(2 * 120 + 1)); EXPECT_THAT(result->entries(), Eq(1)); EXPECT_THAT(result->begin_pos(), Eq(0)); - Unref(result); } TEST_P(CordRingCreateTest, EntryForNewFlat) { @@ -700,7 +643,6 @@ TEST_P(CordRingCreateTest, EntryForNewFlat) { EXPECT_THAT(result->entry_child(0), Eq(child)); EXPECT_THAT(result->entry_end_pos(0), Eq(str1.length())); 
EXPECT_THAT(result->entry_data_offset(0), Eq(0)); - Unref(result); } TEST_P(CordRingCreateTest, EntryForNewFlatSubstring) { @@ -712,7 +654,6 @@ TEST_P(CordRingCreateTest, EntryForNewFlatSubstring) { EXPECT_THAT(result->entry_child(0), Eq(child)); EXPECT_THAT(result->entry_end_pos(0), Eq(26)); EXPECT_THAT(result->entry_data_offset(0), Eq(10)); - Unref(result); } TEST_P(CordRingBuildTest, AppendFlat) { @@ -722,10 +663,9 @@ TEST_P(CordRingBuildTest, AppendFlat) { CordRepRing* result = NeedsUnref(CordRepRing::Append(ring, MakeFlat(str2))); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(result->length, Eq(str1.size() + str2.size())); EXPECT_THAT(ToFlats(result), ElementsAre(str1, str2)); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, PrependFlat) { @@ -735,10 +675,9 @@ TEST_P(CordRingBuildTest, PrependFlat) { CordRepRing* result = NeedsUnref(CordRepRing::Prepend(ring, MakeFlat(str2))); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(result->length, Eq(str1.size() + str2.size())); EXPECT_THAT(ToFlats(result), ElementsAre(str2, str1)); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, AppendString) { @@ -748,10 +687,9 @@ TEST_P(CordRingBuildTest, AppendString) { CordRepRing* result = NeedsUnref(CordRepRing::Append(ring, str2)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(result->length, Eq(str1.size() + str2.size())); EXPECT_THAT(ToFlats(result), ElementsAre(str1, str2)); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, AppendStringHavingExtra) { @@ -762,8 +700,7 @@ TEST_P(CordRingBuildTest, AppendStringHavingExtra) { ASSERT_THAT(result, 
IsValidRingBuffer()); EXPECT_THAT(result->length, Eq(str1.size() + str2.size())); EXPECT_THAT(result, EqIfPrivate(GetParam(), ring)); - UnrefIfShared(ring); - Unref(result); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); } TEST_P(CordRingBuildTest, AppendStringHavingPartialExtra) { @@ -785,13 +722,12 @@ TEST_P(CordRingBuildTest, AppendStringHavingPartialExtra) { ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result->length, Eq(str1.size() + str2.size())); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); if (GetParam().refcount_is_one) { EXPECT_THAT(ToFlats(result), ElementsAre(StrCat(str1, str1a), str2a)); } else { EXPECT_THAT(ToFlats(result), ElementsAre(str1, str2)); } - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, AppendStringHavingExtraInSubstring) { @@ -802,14 +738,13 @@ TEST_P(CordRingBuildTest, AppendStringHavingExtraInSubstring) { CordRepRing* result = NeedsUnref(CordRepRing::Append(ring, str2)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivate(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(result->length, Eq(4 + str2.size())); if (GetParam().refcount_is_one) { EXPECT_THAT(ToFlats(result), ElementsAre(StrCat("1234", str2))); } else { EXPECT_THAT(ToFlats(result), ElementsAre("1234", str2)); } - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, AppendStringHavingSharedExtra) { @@ -837,10 +772,9 @@ TEST_P(CordRingBuildTest, AppendStringHavingSharedExtra) { CordRepRing* result = NeedsUnref(CordRepRing::Append(ring, str2)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(result->length, Eq(4 + str2.size())); EXPECT_THAT(ToFlats(result), ElementsAre("1234", str2)); - UnrefIfShared(ring); - Unref(result); CordRep::Unref(shared_type == 1 ? 
flat1 : flat); } @@ -857,8 +791,6 @@ TEST_P(CordRingBuildTest, AppendStringWithExtra) { EXPECT_THAT(result->length, Eq(str1.size() + str2.size() + str3.size())); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre(str1, StrCat(str2, str3))); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, PrependString) { @@ -875,8 +807,6 @@ TEST_P(CordRingBuildTest, PrependString) { } EXPECT_THAT(result->length, Eq(str1.size() + str2.size())); EXPECT_THAT(ToFlats(result), ElementsAre(str2, str1)); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, PrependStringHavingExtra) { @@ -887,14 +817,13 @@ TEST_P(CordRingBuildTest, PrependStringHavingExtra) { CordRepRing* result = NeedsUnref(CordRepRing::Prepend(ring, str2)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivate(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(result->length, Eq(4 + str2.size())); if (GetParam().refcount_is_one) { EXPECT_THAT(ToFlats(result), ElementsAre(StrCat(str2, "1234"))); } else { EXPECT_THAT(ToFlats(result), ElementsAre(str2, "1234")); } - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, PrependStringHavingSharedExtra) { @@ -920,9 +849,8 @@ TEST_P(CordRingBuildTest, PrependStringHavingSharedExtra) { ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result->length, Eq(str1a.size() + str2.size())); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre(str2, str1a)); - UnrefIfShared(ring); - Unref(result); CordRep::Unref(shared_type == 1 ? 
flat1 : flat); } } @@ -938,8 +866,6 @@ TEST_P(CordRingBuildTest, PrependStringWithExtra) { EXPECT_THAT(result->length, Eq(str1.size() + str2.size() + str3.size())); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre(StrCat(str3, str2), str1)); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, AppendPrependStringMix) { @@ -950,12 +876,10 @@ TEST_P(CordRingBuildTest, AppendPrependStringMix) { result = CordRepRing::Prepend(result, flats[4 - i]); result = CordRepRing::Append(result, flats[4 + i]); } - UnrefIfShared(ring); NeedsUnref(result); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); EXPECT_THAT(ToString(result), kFox); - Unref(result); } TEST_P(CordRingBuildTest, AppendPrependStringMixWithExtra) { @@ -976,8 +900,6 @@ TEST_P(CordRingBuildTest, AppendPrependStringMixWithExtra) { EXPECT_THAT(ToFlats(result), ElementsAre("The quick brown fox ", "jumps ", "over the lazy dog")); } - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, AppendPrependStringMixWithPrependedExtra) { @@ -998,8 +920,6 @@ TEST_P(CordRingBuildTest, AppendPrependStringMixWithPrependedExtra) { EXPECT_THAT(ToFlats(result), ElementsAre("The quick brown fox ", "jumps ", "over the lazy dog")); } - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingSubTest, SubRing) { @@ -1011,16 +931,14 @@ TEST_P(CordRingSubTest, SubRing) { CordRepRing* ring = RefIfShared(FromFlats(flats, composition)); CordRepRing* result = CordRepRing::SubRing(ring, offset, 0); EXPECT_THAT(result, nullptr); - UnrefIfShared(ring); for (size_t len = 1; len < all.size() - offset; ++len) { ring = RefIfShared(FromFlats(flats, composition)); result = NeedsUnref(CordRepRing::SubRing(ring, offset, len)); ASSERT_THAT(result, IsValidRingBuffer()); ASSERT_THAT(result, EqIfPrivate(GetParam(), ring)); + ASSERT_THAT(result, NeIfShared(GetParam(), ring)); ASSERT_THAT(ToString(result), 
Eq(all.substr(offset, len))); - UnrefIfShared(ring); - Unref(result); } } } @@ -1039,18 +957,16 @@ TEST_P(CordRingSubTest, SubRingFromLargeExternal) { CordRepRing* ring = RefIfShared(FromFlats(flats, composition)); CordRepRing* result = CordRepRing::SubRing(ring, offset, 0); EXPECT_THAT(result, nullptr); - UnrefIfShared(ring); for (size_t len = all.size() - 30; len < all.size() - offset; ++len) { ring = RefIfShared(FromFlats(flats, composition)); result = NeedsUnref(CordRepRing::SubRing(ring, offset, len)); ASSERT_THAT(result, IsValidRingBuffer()); ASSERT_THAT(result, EqIfPrivate(GetParam(), ring)); + ASSERT_THAT(result, NeIfShared(GetParam(), ring)); auto str = ToString(result); ASSERT_THAT(str, SizeIs(len)); ASSERT_THAT(str, Eq(all.substr(offset, len))); - UnrefIfShared(ring); - Unref(result); } } } @@ -1063,16 +979,14 @@ TEST_P(CordRingSubTest, RemovePrefix) { CordRepRing* ring = RefIfShared(FromFlats(flats, composition)); CordRepRing* result = CordRepRing::RemovePrefix(ring, all.size()); EXPECT_THAT(result, nullptr); - UnrefIfShared(ring); for (size_t len = 1; len < all.size(); ++len) { ring = RefIfShared(FromFlats(flats, composition)); result = NeedsUnref(CordRepRing::RemovePrefix(ring, len)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivate(GetParam(), ring)); + ASSERT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToString(result), Eq(all.substr(len))); - UnrefIfShared(ring); - Unref(result); } } @@ -1087,7 +1001,6 @@ TEST_P(CordRingSubTest, RemovePrefixFromLargeExternal) { ElementsAre( not_a_string_view(external1->base, 1 << 20).remove_prefix(1 << 16), not_a_string_view(external2->base, 1 << 20))); - Unref(result); } TEST_P(CordRingSubTest, RemoveSuffix) { @@ -1098,16 +1011,14 @@ TEST_P(CordRingSubTest, RemoveSuffix) { CordRepRing* ring = RefIfShared(FromFlats(flats, composition)); CordRepRing* result = CordRepRing::RemoveSuffix(ring, all.size()); EXPECT_THAT(result, nullptr); - UnrefIfShared(ring); for (size_t len = 1; len 
< all.size(); ++len) { ring = RefIfShared(FromFlats(flats, composition)); result = NeedsUnref(CordRepRing::RemoveSuffix(ring, len)); ASSERT_THAT(result, IsValidRingBuffer()); - EXPECT_THAT(result, EqIfPrivate(GetParam(), ring)); - EXPECT_THAT(ToString(result), Eq(all.substr(0, all.size() - len))); - UnrefIfShared(ring); - Unref(result); + ASSERT_THAT(result, EqIfPrivate(GetParam(), ring)); + ASSERT_THAT(result, NeIfShared(GetParam(), ring)); + ASSERT_THAT(ToString(result), Eq(all.substr(0, all.size() - len))); } } @@ -1120,9 +1031,8 @@ TEST_P(CordRingSubTest, AppendRing) { CordRepRing* result = NeedsUnref(CordRepRing::Append(ring, child)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivate(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAreArray(kFoxFlats)); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildInputTest, AppendRingWithFlatOffset) { @@ -1135,11 +1045,9 @@ TEST_P(CordRingBuildInputTest, AppendRingWithFlatOffset) { CordRepRing* result = NeedsUnref(CordRepRing::Append(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre("Head", "brown ", "fox ", "jumps ", "over ", "the ", "lazy ", "dog")); - UnrefIfInputSharedIndirect(child); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildInputTest, AppendRingWithBrokenOffset) { @@ -1152,11 +1060,9 @@ TEST_P(CordRingBuildInputTest, AppendRingWithBrokenOffset) { CordRepRing* result = NeedsUnref(CordRepRing::Append(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre("Head", "umps ", "over ", "the ", "lazy ", "dog")); - UnrefIfInputSharedIndirect(child); - UnrefIfShared(ring); - 
Unref(result); } TEST_P(CordRingBuildInputTest, AppendRingWithFlatLength) { @@ -1169,11 +1075,9 @@ TEST_P(CordRingBuildInputTest, AppendRingWithFlatLength) { CordRepRing* result = NeedsUnref(CordRepRing::Append(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre("Head", "The ", "quick ", "brown ", "fox ", "jumps ", "over ", "the ")); - UnrefIfInputSharedIndirect(child); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, AppendRingWithBrokenFlatLength) { @@ -1186,11 +1090,9 @@ TEST_P(CordRingBuildTest, AppendRingWithBrokenFlatLength) { CordRepRing* result = NeedsUnref(CordRepRing::Append(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre("Head", "The ", "quick ", "brown ", "fox ", "jumps ", "ov")); - UnrefIfInputSharedIndirect(child); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, AppendRingMiddlePiece) { @@ -1203,11 +1105,9 @@ TEST_P(CordRingBuildTest, AppendRingMiddlePiece) { CordRepRing* result = NeedsUnref(CordRepRing::Append(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre("Head", "ck ", "brown ", "fox ", "jum")); - UnrefIfInputSharedIndirect(child); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildTest, AppendRingSinglePiece) { @@ -1220,11 +1120,8 @@ TEST_P(CordRingBuildTest, AppendRingSinglePiece) { CordRepRing* result = NeedsUnref(CordRepRing::Append(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), 
ring)); EXPECT_THAT(ToFlats(result), ElementsAre("Head", "row")); - UnrefIfInputSharedIndirect(child); - UnrefIfInputShared(stripped); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildInputTest, AppendRingSinglePieceWithPrefix) { @@ -1241,11 +1138,8 @@ TEST_P(CordRingBuildInputTest, AppendRingSinglePieceWithPrefix) { CordRepRing* result = NeedsUnref(CordRepRing::Append(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre("Prepend", "Head", "row")); - UnrefIfInputSharedIndirect(child); - UnrefIfInputShared(stripped); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildInputTest, PrependRing) { @@ -1258,10 +1152,8 @@ TEST_P(CordRingBuildInputTest, PrependRing) { CordRepRing* result = NeedsUnref(CordRepRing::Prepend(ring, child)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAreArray(kFoxFlats)); - UnrefIfInputShared(child); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildInputTest, PrependRingWithFlatOffset) { @@ -1274,12 +1166,9 @@ TEST_P(CordRingBuildInputTest, PrependRingWithFlatOffset) { CordRepRing* result = NeedsUnref(CordRepRing::Prepend(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre("brown ", "fox ", "jumps ", "over ", "the ", "lazy ", "dog", "Tail")); - UnrefIfInputShared(child); - UnrefIfInputSharedIndirect(stripped); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildInputTest, PrependRingWithBrokenOffset) { @@ -1291,12 +1180,9 @@ TEST_P(CordRingBuildInputTest, PrependRingWithBrokenOffset) { CordRep* stripped = 
RefIfInputSharedIndirect(RemovePrefix(21, child)); CordRepRing* result = NeedsUnref(CordRepRing::Prepend(ring, stripped)); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre("umps ", "over ", "the ", "lazy ", "dog", "Tail")); - UnrefIfInputShared(child); - UnrefIfInputSharedIndirect(stripped); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildInputTest, PrependRingWithFlatLength) { @@ -1309,12 +1195,9 @@ TEST_P(CordRingBuildInputTest, PrependRingWithFlatLength) { CordRepRing* result = NeedsUnref(CordRepRing::Prepend(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre("The ", "quick ", "brown ", "fox ", "jumps ", "over ", "the ", "Tail")); - UnrefIfShared(ring); - UnrefIfInputShared(child); - UnrefIfInputSharedIndirect(stripped); - Unref(result); } TEST_P(CordRingBuildInputTest, PrependRingWithBrokenFlatLength) { @@ -1327,12 +1210,9 @@ TEST_P(CordRingBuildInputTest, PrependRingWithBrokenFlatLength) { CordRepRing* result = NeedsUnref(CordRepRing::Prepend(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre("The ", "quick ", "brown ", "fox ", "jumps ", "ov", "Tail")); - UnrefIfInputShared(child); - UnrefIfInputSharedIndirect(stripped); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildInputTest, PrependRingMiddlePiece) { @@ -1346,12 +1226,9 @@ TEST_P(CordRingBuildInputTest, PrependRingMiddlePiece) { CordRepRing* result = NeedsUnref(CordRepRing::Prepend(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), 
ring)); EXPECT_THAT(ToFlats(result), ElementsAre("ck ", "brown ", "fox ", "jum", "Tail")); - UnrefIfInputShared(child); - UnrefIfInputSharedIndirect(stripped); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildInputTest, PrependRingSinglePiece) { @@ -1364,11 +1241,8 @@ TEST_P(CordRingBuildInputTest, PrependRingSinglePiece) { CordRepRing* result = NeedsUnref(CordRepRing::Prepend(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre("row", "Tail")); - UnrefIfInputShared(child); - UnrefIfInputSharedIndirect(stripped); - UnrefIfShared(ring); - Unref(result); } TEST_P(CordRingBuildInputTest, PrependRingSinglePieceWithPrefix) { @@ -1384,11 +1258,8 @@ TEST_P(CordRingBuildInputTest, PrependRingSinglePieceWithPrefix) { CordRepRing* result = NeedsUnref(CordRepRing::Prepend(ring, stripped)); ASSERT_THAT(result, IsValidRingBuffer()); EXPECT_THAT(result, EqIfPrivateAndCapacity(GetParam(), ring)); + EXPECT_THAT(result, NeIfShared(GetParam(), ring)); EXPECT_THAT(ToFlats(result), ElementsAre("row", "Prepend", "Tail")); - UnrefIfInputShared(child); - UnrefIfInputSharedIndirect(stripped); - UnrefIfShared(ring); - Unref(result); } TEST_F(CordRingTest, Find) { @@ -1406,7 +1277,6 @@ TEST_F(CordRingTest, Find) { ASSERT_THAT(found.offset, Lt(data.length())); ASSERT_THAT(data[found.offset], Eq(value[i])); } - Unref(ring); } TEST_F(CordRingTest, FindWithHint) { @@ -1442,7 +1312,6 @@ TEST_F(CordRingTest, FindWithHint) { ++flat_pos; flat_offset += flat.length(); } - Unref(ring); } TEST_F(CordRingTest, FindInLargeRing) { @@ -1464,7 +1333,6 @@ TEST_F(CordRingTest, FindInLargeRing) { ASSERT_THAT(pos.offset, Lt(data.length())); ASSERT_THAT(data[pos.offset], Eq(value[i])); } - Unref(ring); } TEST_F(CordRingTest, FindTail) { @@ -1483,7 +1351,6 @@ TEST_F(CordRingTest, FindTail) { ASSERT_THAT(pos.offset, Lt(data.length())); 
ASSERT_THAT(data[data.length() - pos.offset - 1], Eq(value[i])); } - Unref(ring); } TEST_F(CordRingTest, FindTailWithHint) { @@ -1510,7 +1377,6 @@ TEST_F(CordRingTest, FindTailWithHint) { ASSERT_THAT(pos.offset, Lt(data.length())); ASSERT_THAT(data[data.length() - pos.offset - 1], Eq(value[i])); } - Unref(ring); } TEST_F(CordRingTest, FindTailInLargeRing) { @@ -1532,7 +1398,6 @@ TEST_F(CordRingTest, FindTailInLargeRing) { ASSERT_THAT(pos.offset, Lt(data.length())); ASSERT_THAT(data[data.length() - pos.offset - 1], Eq(value[i])); } - Unref(ring); } TEST_F(CordRingTest, GetCharacter) { @@ -1544,7 +1409,6 @@ TEST_F(CordRingTest, GetCharacter) { for (int i = 0; i < value.length(); ++i) { ASSERT_THAT(result->GetCharacter(i), Eq(value[i])); } - Unref(result); } TEST_F(CordRingTest, GetCharacterWithSubstring) { @@ -1556,7 +1420,67 @@ TEST_F(CordRingTest, GetCharacterWithSubstring) { for (int i = 0; i < value.length(); ++i) { ASSERT_THAT(result->GetCharacter(i), Eq(value[i])); } - Unref(result); +} + +TEST_F(CordRingTest, IsFlatSingleFlat) { + for (bool external : {false, true}) { + SCOPED_TRACE(external ? "With External" : "With Flat"); + absl::string_view str = "Hello world"; + CordRep* rep = external ? MakeExternal(str) : MakeFlat(str); + CordRepRing* ring = NeedsUnref(CordRepRing::Create(rep)); + + // The ring is a single non-fragmented flat: + absl::string_view fragment; + EXPECT_TRUE(ring->IsFlat(nullptr)); + EXPECT_TRUE(ring->IsFlat(&fragment)); + EXPECT_THAT(fragment, Eq("Hello world")); + fragment = ""; + EXPECT_TRUE(ring->IsFlat(0, 11, nullptr)); + EXPECT_TRUE(ring->IsFlat(0, 11, &fragment)); + EXPECT_THAT(fragment, Eq("Hello world")); + + // Arbitrary ranges must check true as well. + EXPECT_TRUE(ring->IsFlat(1, 4, &fragment)); + EXPECT_THAT(fragment, Eq("ello")); + EXPECT_TRUE(ring->IsFlat(6, 5, &fragment)); + EXPECT_THAT(fragment, Eq("world")); + } +} + +TEST_F(CordRingTest, IsFlatMultiFlat) { + for (bool external : {false, true}) { + SCOPED_TRACE(external ? 
"With External" : "With Flat"); + absl::string_view str1 = "Hello world"; + absl::string_view str2 = "Halt and catch fire"; + CordRep* rep1 = external ? MakeExternal(str1) : MakeFlat(str1); + CordRep* rep2 = external ? MakeExternal(str2) : MakeFlat(str2); + CordRepRing* ring = CordRepRing::Append(CordRepRing::Create(rep1), rep2); + NeedsUnref(ring); + + // The ring is fragmented, IsFlat() on the entire cord must be false. + EXPECT_FALSE(ring->IsFlat(nullptr)); + absl::string_view fragment = "Don't touch this"; + EXPECT_FALSE(ring->IsFlat(&fragment)); + EXPECT_THAT(fragment, Eq("Don't touch this")); + + // Check for ranges exactly within both flats. + EXPECT_TRUE(ring->IsFlat(0, 11, &fragment)); + EXPECT_THAT(fragment, Eq("Hello world")); + EXPECT_TRUE(ring->IsFlat(11, 19, &fragment)); + EXPECT_THAT(fragment, Eq("Halt and catch fire")); + + // Check for arbitrary partial range inside each flat. + EXPECT_TRUE(ring->IsFlat(1, 4, &fragment)); + EXPECT_THAT(fragment, "ello"); + EXPECT_TRUE(ring->IsFlat(26, 4, &fragment)); + EXPECT_THAT(fragment, "fire"); + + // Check ranges spanning across both flats + fragment = "Don't touch this"; + EXPECT_FALSE(ring->IsFlat(1, 18, &fragment)); + EXPECT_FALSE(ring->IsFlat(10, 2, &fragment)); + EXPECT_THAT(fragment, Eq("Don't touch this")); + } } TEST_F(CordRingTest, Dump) { @@ -1564,7 +1488,6 @@ TEST_F(CordRingTest, Dump) { auto flats = MakeSpan(kFoxFlats); CordRepRing* ring = NeedsUnref(FromFlats(flats, kPrepend)); ss << *ring; - Unref(ring); } } // namespace diff --git a/third_party/abseil-cpp/absl/strings/cord_test.cc b/third_party/abseil-cpp/absl/strings/cord_test.cc index f9982428b3..14eca15573 100644 --- a/third_party/abseil-cpp/absl/strings/cord_test.cc +++ b/third_party/abseil-cpp/absl/strings/cord_test.cc @@ -35,6 +35,7 @@ #include "absl/base/macros.h" #include "absl/container/fixed_array.h" #include "absl/strings/cord_test_helpers.h" +#include "absl/strings/cordz_test_helpers.h" #include "absl/strings/str_cat.h" #include 
"absl/strings/str_format.h" #include "absl/strings/string_view.h" @@ -187,6 +188,19 @@ class CordTestPeer { static cord_internal::CordzInfo* GetCordzInfo(const Cord& c) { return c.contents_.cordz_info(); } + + static Cord MakeSubstring(Cord src, size_t offset, size_t length) { + ABSL_RAW_CHECK(src.contents_.is_tree(), "Can not be inlined"); + Cord cord; + auto* rep = new cord_internal::CordRepSubstring; + rep->tag = cord_internal::SUBSTRING; + rep->child = cord_internal::CordRep::Ref(src.contents_.tree()); + rep->start = offset; + rep->length = length; + cord.contents_.EmplaceTree(rep, + cord_internal::CordzUpdateTracker::kSubCord); + return cord; + } }; ABSL_NAMESPACE_END @@ -227,7 +241,6 @@ TEST(GigabyteCord, FromExternal) { // caused crashes in production. We grow exponentially so that the code will // execute in a reasonable amount of time. absl::Cord c; - ABSL_RAW_LOG(INFO, "Made a Cord with %zu bytes!", c.size()); c.Append(from); while (c.size() < max_size) { c.Append(c); @@ -466,8 +479,8 @@ TEST(TryFlat, SubstrInlined) { TEST(TryFlat, SubstrFlat) { absl::Cord c("longer than 15 bytes"); - c.RemovePrefix(1); - EXPECT_EQ(c.TryFlat(), "onger than 15 bytes"); + absl::Cord sub = absl::CordTestPeer::MakeSubstring(c, 1, c.size() - 1); + EXPECT_EQ(sub.TryFlat(), "onger than 15 bytes"); } TEST(TryFlat, Concat) { @@ -482,16 +495,46 @@ TEST(TryFlat, External) { TEST(TryFlat, SubstrExternal) { absl::Cord c = absl::MakeCordFromExternal("hell", [](absl::string_view) {}); - c.RemovePrefix(1); - EXPECT_EQ(c.TryFlat(), "ell"); + absl::Cord sub = absl::CordTestPeer::MakeSubstring(c, 1, c.size() - 1); + EXPECT_EQ(sub.TryFlat(), "ell"); } TEST(TryFlat, SubstrConcat) { absl::Cord c = absl::MakeFragmentedCord({"hello", " world"}); + absl::Cord sub = absl::CordTestPeer::MakeSubstring(c, 1, c.size() - 1); + EXPECT_EQ(sub.TryFlat(), absl::nullopt); c.RemovePrefix(1); EXPECT_EQ(c.TryFlat(), absl::nullopt); } +TEST(TryFlat, CommonlyAssumedInvariants) { + // The behavior tested below is 
not part of the API contract of Cord, but it's + // something we intend to be true in our current implementation. This test + // exists to detect and prevent accidental breakage of the implementation. + absl::string_view fragments[] = {"A fragmented test", + " cord", + " to test subcords", + " of ", + "a", + " cord for", + " each chunk " + "returned by the ", + "iterator"}; + absl::Cord c = absl::MakeFragmentedCord(fragments); + int fragment = 0; + int offset = 0; + absl::Cord::CharIterator itc = c.char_begin(); + for (absl::string_view sv : c.Chunks()) { + absl::string_view expected = fragments[fragment]; + absl::Cord subcord1 = c.Subcord(offset, sv.length()); + absl::Cord subcord2 = absl::Cord::AdvanceAndRead(&itc, sv.size()); + EXPECT_EQ(subcord1.TryFlat(), expected); + EXPECT_EQ(subcord2.TryFlat(), expected); + ++fragment; + offset += sv.length(); + } +} + static bool IsFlat(const absl::Cord& c) { return c.chunk_begin() == c.chunk_end() || ++c.chunk_begin() == c.chunk_end(); } @@ -1274,6 +1317,26 @@ TEST(Cord, Concat_Append) { EXPECT_EQ(s2.size(), size + 1); } +TEST(Cord, DiabolicalGrowth) { + // This test exercises a diabolical Append(<one char>) on a cord, making the + // cord shared before each Append call resulting in a terribly fragmented + // resulting cord. + // TODO(b/183983616): Apply some minimum compaction when copying a shared + // source cord into a mutable copy for updates in CordRepRing. 
+ RandomEngine rng(testing::GTEST_FLAG(random_seed)); + const std::string expected = RandomLowercaseString(&rng, 5000); + absl::Cord cord; + for (char c : expected) { + absl::Cord shared(cord); + cord.Append(absl::string_view(&c, 1)); + } + std::string value; + absl::CopyCordToString(cord, &value); + EXPECT_EQ(value, expected); + ABSL_RAW_LOG(INFO, "Diabolical size allocated = %zu", + cord.EstimatedMemoryUsage()); +} + TEST(MakeFragmentedCord, MakeFragmentedCordFromInitializerList) { absl::Cord fragmented = absl::MakeFragmentedCord({"A ", "fragmented ", "Cord"}); diff --git a/third_party/abseil-cpp/absl/strings/cord_test_helpers.h b/third_party/abseil-cpp/absl/strings/cord_test_helpers.h index f1036e3b13..31a1dc8980 100644 --- a/third_party/abseil-cpp/absl/strings/cord_test_helpers.h +++ b/third_party/abseil-cpp/absl/strings/cord_test_helpers.h @@ -17,11 +17,73 @@ #ifndef ABSL_STRINGS_CORD_TEST_HELPERS_H_ #define ABSL_STRINGS_CORD_TEST_HELPERS_H_ +#include <cstdint> +#include <iostream> +#include <string> + +#include "absl/base/config.h" #include "absl/strings/cord.h" +#include "absl/strings/internal/cord_internal.h" +#include "absl/strings/string_view.h" namespace absl { ABSL_NAMESPACE_BEGIN +// Cord sizes relevant for testing +enum class TestCordSize { + // An empty value + kEmpty = 0, + + // An inlined string value + kInlined = cord_internal::kMaxInline / 2 + 1, + + // 'Well known' SSO lengths (excluding terminating zero). + // libstdcxx has a maximum SSO of 15, libc++ has a maximum SSO of 22. + kStringSso1 = 15, + kStringSso2 = 22, + + // A string value which is too large to fit in inlined data, but small enough + // such that Cord prefers copying the value if possible, i.e.: not stealing + // std::string inputs, or referencing existing CordReps on Append, etc. + kSmall = cord_internal::kMaxBytesToCopy / 2 + 1, + + // A string value large enough that Cord prefers to reference or steal from + // existing inputs rather than copying contents of the input. 
+ kMedium = cord_internal::kMaxFlatLength / 2 + 1, + + // A string value large enough to cause it to be stored in mutliple flats. + kLarge = cord_internal::kMaxFlatLength * 4 +}; + +// To string helper +inline absl::string_view ToString(TestCordSize size) { + switch (size) { + case TestCordSize::kEmpty: + return "Empty"; + case TestCordSize::kInlined: + return "Inlined"; + case TestCordSize::kSmall: + return "Small"; + case TestCordSize::kStringSso1: + return "StringSso1"; + case TestCordSize::kStringSso2: + return "StringSso2"; + case TestCordSize::kMedium: + return "Medium"; + case TestCordSize::kLarge: + return "Large"; + } + return "???"; +} + +// Returns the length matching the specified size +inline size_t Length(TestCordSize size) { return static_cast<size_t>(size); } + +// Stream output helper +inline std::ostream& operator<<(std::ostream& stream, TestCordSize size) { + return stream << ToString(size); +} + // Creates a multi-segment Cord from an iterable container of strings. The // resulting Cord is guaranteed to have one segment for every string in the // container. This allows code to be unit tested with multi-segment Cord diff --git a/third_party/abseil-cpp/absl/strings/cordz_test.cc b/third_party/abseil-cpp/absl/strings/cordz_test.cc new file mode 100644 index 0000000000..2b7d30b0e0 --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/cordz_test.cc @@ -0,0 +1,466 @@ +// Copyright 2021 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include <cstdint> +#include <string> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/base/config.h" +#include "absl/base/internal/raw_logging.h" +#include "absl/base/macros.h" +#include "absl/strings/cord.h" +#include "absl/strings/cord_test_helpers.h" +#include "absl/strings/cordz_test_helpers.h" +#include "absl/strings/internal/cordz_functions.h" +#include "absl/strings/internal/cordz_info.h" +#include "absl/strings/internal/cordz_sample_token.h" +#include "absl/strings/internal/cordz_statistics.h" +#include "absl/strings/internal/cordz_update_tracker.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" + +#ifdef ABSL_INTERNAL_CORDZ_ENABLED + +using testing::Eq; +using testing::AnyOf; + +namespace absl { +ABSL_NAMESPACE_BEGIN + +using cord_internal::CordzInfo; +using cord_internal::CordzSampleToken; +using cord_internal::CordzStatistics; +using cord_internal::CordzUpdateTracker; +using Method = CordzUpdateTracker::MethodIdentifier; + +// Do not print cord contents, we only care about 'size' perhaps. +// Note that this method must be inside the named namespace. +inline void PrintTo(const Cord& cord, std::ostream* s) { + if (s) *s << "Cord[" << cord.size() << "]"; +} + +namespace { + +auto constexpr kMaxInline = cord_internal::kMaxInline; + +// Returns a string_view value of the specified length +// We do this to avoid 'consuming' large strings in Cord by default. +absl::string_view MakeString(size_t size) { + thread_local std::string str; + str = std::string(size, '.'); + return str; +} + +absl::string_view MakeString(TestCordSize size) { + return MakeString(Length(size)); +} + +// Returns a cord with a sampled method of kAppendString. 
+absl::Cord MakeAppendStringCord(TestCordSize size) { + CordzSamplingIntervalHelper always(1); + absl::Cord cord; + cord.Append(MakeString(size)); + return cord; +} + +std::string TestParamToString(::testing::TestParamInfo<TestCordSize> size) { + return absl::StrCat("On", ToString(size.param), "Cord"); +} + +class CordzUpdateTest : public testing::TestWithParam<TestCordSize> { + public: + Cord& cord() { return cord_; } + + Method InitialOr(Method method) const { + return (GetParam() > TestCordSize::kInlined) ? Method::kConstructorString + : method; + } + + private: + CordzSamplingIntervalHelper sample_every_{1}; + Cord cord_{MakeString(GetParam())}; +}; + +template <typename T> +std::string ParamToString(::testing::TestParamInfo<T> param) { + return std::string(ToString(param.param)); +} + +INSTANTIATE_TEST_SUITE_P(WithParam, CordzUpdateTest, + testing::Values(TestCordSize::kEmpty, + TestCordSize::kInlined, + TestCordSize::kLarge), + TestParamToString); + +class CordzStringTest : public testing::TestWithParam<TestCordSize> { + private: + CordzSamplingIntervalHelper sample_every_{1}; +}; + +INSTANTIATE_TEST_SUITE_P(WithParam, CordzStringTest, + testing::Values(TestCordSize::kInlined, + TestCordSize::kStringSso1, + TestCordSize::kStringSso2, + TestCordSize::kSmall, + TestCordSize::kLarge), + ParamToString<TestCordSize>); + +TEST(CordzTest, ConstructSmallArray) { + CordzSamplingIntervalHelper sample_every{1}; + Cord cord(MakeString(TestCordSize::kSmall)); + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kConstructorString)); +} + +TEST(CordzTest, ConstructLargeArray) { + CordzSamplingIntervalHelper sample_every{1}; + Cord cord(MakeString(TestCordSize::kLarge)); + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kConstructorString)); +} + +TEST_P(CordzStringTest, ConstructString) { + CordzSamplingIntervalHelper sample_every{1}; + Cord cord(std::string(Length(GetParam()), '.')); + if (Length(GetParam()) > kMaxInline) { + EXPECT_THAT(cord, 
HasValidCordzInfoOf(Method::kConstructorString)); + } +} + +TEST(CordzTest, CopyConstructFromUnsampled) { + CordzSamplingIntervalHelper sample_every{1}; + Cord src = UnsampledCord(MakeString(TestCordSize::kLarge)); + Cord cord(src); + EXPECT_THAT(GetCordzInfoForTesting(cord), Eq(nullptr)); +} + +TEST(CordzTest, CopyConstructFromSampled) { + CordzSamplingIntervalHelper sample_never{99999}; + Cord src = MakeAppendStringCord(TestCordSize::kLarge); + Cord cord(src); + ASSERT_THAT(cord, HasValidCordzInfoOf(Method::kConstructorCord)); + CordzStatistics stats = GetCordzInfoForTesting(cord)->GetCordzStatistics(); + EXPECT_THAT(stats.parent_method, Eq(Method::kAppendString)); + EXPECT_THAT(stats.update_tracker.Value(Method::kAppendString), Eq(1)); +} + +TEST(CordzTest, MoveConstruct) { + CordzSamplingIntervalHelper sample_every{1}; + Cord src(MakeString(TestCordSize::kLarge)); + Cord cord(std::move(src)); + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kConstructorString)); +} + +TEST_P(CordzUpdateTest, AssignUnsampledCord) { + Cord src = UnsampledCord(MakeString(TestCordSize::kLarge)); + const CordzInfo* info = GetCordzInfoForTesting(cord()); + cord() = src; + EXPECT_THAT(GetCordzInfoForTesting(cord()), Eq(nullptr)); + EXPECT_FALSE(CordzInfoIsListed(info)); +} + +TEST_P(CordzUpdateTest, AssignSampledCord) { + Cord src = MakeAppendStringCord(TestCordSize::kLarge); + cord() = src; + ASSERT_THAT(cord(), HasValidCordzInfoOf(Method::kAssignCord)); + CordzStatistics stats = GetCordzInfoForTesting(cord())->GetCordzStatistics(); + EXPECT_THAT(stats.parent_method, Eq(Method::kAppendString)); + EXPECT_THAT(stats.update_tracker.Value(Method::kAppendString), Eq(1)); + EXPECT_THAT(stats.update_tracker.Value(Method::kConstructorString), Eq(0)); +} + +TEST(CordzUpdateTest, AssignSampledCordToInlined) { + CordzSamplingIntervalHelper sample_never{99999}; + Cord cord; + Cord src = MakeAppendStringCord(TestCordSize::kLarge); + cord = src; + ASSERT_THAT(cord, 
HasValidCordzInfoOf(Method::kAssignCord)); + CordzStatistics stats = GetCordzInfoForTesting(cord)->GetCordzStatistics(); + EXPECT_THAT(stats.parent_method, Eq(Method::kAppendString)); + EXPECT_THAT(stats.update_tracker.Value(Method::kAppendString), Eq(1)); + EXPECT_THAT(stats.update_tracker.Value(Method::kConstructorString), Eq(0)); +} + +TEST(CordzUpdateTest, AssignSampledCordToUnsampledCord) { + CordzSamplingIntervalHelper sample_never{99999}; + Cord cord = UnsampledCord(MakeString(TestCordSize::kLarge)); + Cord src = MakeAppendStringCord(TestCordSize::kLarge); + cord = src; + ASSERT_THAT(cord, HasValidCordzInfoOf(Method::kAssignCord)); + CordzStatistics stats = GetCordzInfoForTesting(cord)->GetCordzStatistics(); + EXPECT_THAT(stats.parent_method, Eq(Method::kAppendString)); + EXPECT_THAT(stats.update_tracker.Value(Method::kAppendString), Eq(1)); + EXPECT_THAT(stats.update_tracker.Value(Method::kConstructorString), Eq(0)); +} + +TEST(CordzUpdateTest, AssignUnsampledCordToSampledCordWithoutSampling) { + CordzSamplingIntervalHelper sample_never{99999}; + Cord cord = MakeAppendStringCord(TestCordSize::kLarge); + const CordzInfo* info = GetCordzInfoForTesting(cord); + Cord src = UnsampledCord(MakeString(TestCordSize::kLarge)); + cord = src; + EXPECT_THAT(GetCordzInfoForTesting(cord), Eq(nullptr)); + EXPECT_FALSE(CordzInfoIsListed(info)); +} + +TEST(CordzUpdateTest, AssignUnsampledCordToSampledCordWithSampling) { + CordzSamplingIntervalHelper sample_every{1}; + Cord cord = MakeAppendStringCord(TestCordSize::kLarge); + const CordzInfo* info = GetCordzInfoForTesting(cord); + Cord src = UnsampledCord(MakeString(TestCordSize::kLarge)); + cord = src; + EXPECT_THAT(GetCordzInfoForTesting(cord), Eq(nullptr)); + EXPECT_FALSE(CordzInfoIsListed(info)); +} + +TEST(CordzUpdateTest, AssignSampledCordToSampledCord) { + CordzSamplingIntervalHelper sample_every{1}; + Cord src = MakeAppendStringCord(TestCordSize::kLarge); + Cord cord(MakeString(TestCordSize::kLarge)); + cord = src; + 
ASSERT_THAT(cord, HasValidCordzInfoOf(Method::kAssignCord)); + CordzStatistics stats = GetCordzInfoForTesting(cord)->GetCordzStatistics(); + EXPECT_THAT(stats.parent_method, Eq(Method::kAppendString)); + EXPECT_THAT(stats.update_tracker.Value(Method::kAppendString), Eq(1)); + EXPECT_THAT(stats.update_tracker.Value(Method::kConstructorString), Eq(0)); +} + +TEST(CordzUpdateTest, AssignUnsampledCordToSampledCord) { + CordzSamplingIntervalHelper sample_every{1}; + Cord src = MakeAppendStringCord(TestCordSize::kLarge); + Cord cord(MakeString(TestCordSize::kLarge)); + cord = src; + ASSERT_THAT(cord, HasValidCordzInfoOf(Method::kAssignCord)); + CordzStatistics stats = GetCordzInfoForTesting(cord)->GetCordzStatistics(); + EXPECT_THAT(stats.parent_method, Eq(Method::kAppendString)); + EXPECT_THAT(stats.update_tracker.Value(Method::kAppendString), Eq(1)); + EXPECT_THAT(stats.update_tracker.Value(Method::kConstructorString), Eq(0)); +} + +TEST(CordzTest, AssignInlinedCordToSampledCord) { + CordzSampleToken token; + CordzSamplingIntervalHelper sample_every{1}; + Cord cord(MakeString(TestCordSize::kLarge)); + const CordzInfo* info = GetCordzInfoForTesting(cord); + Cord src = UnsampledCord(MakeString(TestCordSize::kInlined)); + cord = src; + EXPECT_THAT(GetCordzInfoForTesting(cord), Eq(nullptr)); + EXPECT_FALSE(CordzInfoIsListed(info)); +} + +TEST(CordzUpdateTest, MoveAssignCord) { + CordzSamplingIntervalHelper sample_every{1}; + Cord cord; + Cord src(MakeString(TestCordSize::kLarge)); + cord = std::move(src); + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kConstructorString)); +} + +TEST_P(CordzUpdateTest, AssignLargeArray) { + cord() = MakeString(TestCordSize::kSmall); + EXPECT_THAT(cord(), HasValidCordzInfoOf(Method::kAssignString)); +} + +TEST_P(CordzUpdateTest, AssignSmallArray) { + cord() = MakeString(TestCordSize::kSmall); + EXPECT_THAT(cord(), HasValidCordzInfoOf(Method::kAssignString)); +} + +TEST_P(CordzUpdateTest, AssignInlinedArray) { + cord() = 
MakeString(TestCordSize::kInlined); + EXPECT_THAT(GetCordzInfoForTesting(cord()), Eq(nullptr)); +} + +TEST_P(CordzStringTest, AssignStringToInlined) { + Cord cord; + cord = std::string(Length(GetParam()), '.'); + if (Length(GetParam()) > kMaxInline) { + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kAssignString)); + } +} + +TEST_P(CordzStringTest, AssignStringToCord) { + Cord cord(MakeString(TestCordSize::kLarge)); + cord = std::string(Length(GetParam()), '.'); + if (Length(GetParam()) > kMaxInline) { + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kConstructorString)); + EXPECT_THAT(cord, CordzMethodCountEq(Method::kAssignString, 1)); + } +} + +TEST_P(CordzUpdateTest, AssignInlinedString) { + cord() = std::string(Length(TestCordSize::kInlined), '.'); + EXPECT_THAT(GetCordzInfoForTesting(cord()), Eq(nullptr)); +} + +TEST_P(CordzUpdateTest, AppendCord) { + Cord src = UnsampledCord(MakeString(TestCordSize::kLarge)); + cord().Append(src); + EXPECT_THAT(cord(), HasValidCordzInfoOf(InitialOr(Method::kAppendCord))); +} + +TEST_P(CordzUpdateTest, MoveAppendCord) { + cord().Append(UnsampledCord(MakeString(TestCordSize::kLarge))); + EXPECT_THAT(cord(), HasValidCordzInfoOf(InitialOr(Method::kAppendCord))); +} + +TEST_P(CordzUpdateTest, AppendSmallArray) { + cord().Append(MakeString(TestCordSize::kSmall)); + EXPECT_THAT(cord(), HasValidCordzInfoOf(InitialOr(Method::kAppendString))); +} + +TEST_P(CordzUpdateTest, AppendLargeArray) { + cord().Append(MakeString(TestCordSize::kLarge)); + EXPECT_THAT(cord(), HasValidCordzInfoOf(InitialOr(Method::kAppendString))); +} + +TEST_P(CordzStringTest, AppendStringToEmpty) { + Cord cord; + cord.Append(std::string(Length(GetParam()), '.')); + if (Length(GetParam()) > kMaxInline) { + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kAppendString)); + } +} + +TEST_P(CordzStringTest, AppendStringToInlined) { + Cord cord(MakeString(TestCordSize::kInlined)); + cord.Append(std::string(Length(GetParam()), '.')); + if (Length(TestCordSize::kInlined) + 
Length(GetParam()) > kMaxInline) { + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kAppendString)); + } +} + +TEST_P(CordzStringTest, AppendStringToCord) { + Cord cord(MakeString(TestCordSize::kLarge)); + cord.Append(std::string(Length(GetParam()), '.')); + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kConstructorString)); + EXPECT_THAT(cord, CordzMethodCountEq(Method::kAppendString, 1)); +} + +TEST(CordzTest, MakeCordFromExternal) { + CordzSamplingIntervalHelper sample_every{1}; + Cord cord = MakeCordFromExternal("Hello world", [](absl::string_view) {}); + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kMakeCordFromExternal)); +} + +TEST(CordzTest, MakeCordFromEmptyExternal) { + CordzSamplingIntervalHelper sample_every{1}; + Cord cord = MakeCordFromExternal({}, [](absl::string_view) {}); + EXPECT_THAT(GetCordzInfoForTesting(cord), Eq(nullptr)); +} + +TEST_P(CordzUpdateTest, PrependCord) { + Cord src = UnsampledCord(MakeString(TestCordSize::kLarge)); + cord().Prepend(src); + EXPECT_THAT(cord(), HasValidCordzInfoOf(InitialOr(Method::kPrependCord))); +} + +TEST_P(CordzUpdateTest, PrependSmallArray) { + cord().Prepend(MakeString(TestCordSize::kSmall)); + EXPECT_THAT(cord(), HasValidCordzInfoOf(InitialOr(Method::kPrependString))); +} + +TEST_P(CordzUpdateTest, PrependLargeArray) { + cord().Prepend(MakeString(TestCordSize::kLarge)); + EXPECT_THAT(cord(), HasValidCordzInfoOf(InitialOr(Method::kPrependString))); +} + +TEST_P(CordzStringTest, PrependStringToEmpty) { + Cord cord; + cord.Prepend(std::string(Length(GetParam()), '.')); + if (Length(GetParam()) > kMaxInline) { + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kPrependString)); + } +} + +TEST_P(CordzStringTest, PrependStringToInlined) { + Cord cord(MakeString(TestCordSize::kInlined)); + cord.Prepend(std::string(Length(GetParam()), '.')); + if (Length(TestCordSize::kInlined) + Length(GetParam()) > kMaxInline) { + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kPrependString)); + } +} + +TEST_P(CordzStringTest, 
PrependStringToCord) { + Cord cord(MakeString(TestCordSize::kLarge)); + cord.Prepend(std::string(Length(GetParam()), '.')); + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kConstructorString)); + EXPECT_THAT(cord, CordzMethodCountEq(Method::kPrependString, 1)); +} + +TEST(CordzTest, RemovePrefix) { + CordzSamplingIntervalHelper sample_every(1); + Cord cord(MakeString(TestCordSize::kLarge)); + + // Half the cord + cord.RemovePrefix(cord.size() / 2); + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kConstructorString)); + EXPECT_THAT(cord, CordzMethodCountEq(Method::kRemovePrefix, 1)); + + // TODO(mvels): RemovePrefix does not reset to inlined, except if empty? + cord.RemovePrefix(cord.size() - kMaxInline); + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kConstructorString)); + EXPECT_THAT(cord, CordzMethodCountEq(Method::kRemovePrefix, 2)); + + cord.RemovePrefix(cord.size()); + EXPECT_THAT(GetCordzInfoForTesting(cord), Eq(nullptr)); +} + +TEST(CordzTest, RemoveSuffix) { + CordzSamplingIntervalHelper sample_every(1); + Cord cord(MakeString(TestCordSize::kLarge)); + + // Half the cord + cord.RemoveSuffix(cord.size() / 2); + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kConstructorString)); + EXPECT_THAT(cord, CordzMethodCountEq(Method::kRemoveSuffix, 1)); + + // TODO(mvels): RemoveSuffix does not reset to inlined, except if empty? 
+ cord.RemoveSuffix(cord.size() - kMaxInline); + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kConstructorString)); + EXPECT_THAT(cord, CordzMethodCountEq(Method::kRemoveSuffix, 2)); + + cord.RemoveSuffix(cord.size()); + EXPECT_THAT(GetCordzInfoForTesting(cord), Eq(nullptr)); +} + +TEST(CordzTest, SubCordFromUnsampledCord) { + CordzSamplingIntervalHelper sample_every{1}; + Cord src = UnsampledCord(MakeString(TestCordSize::kLarge)); + Cord cord = src.Subcord(10, src.size() / 2); + EXPECT_THAT(GetCordzInfoForTesting(cord), Eq(nullptr)); +} + +TEST(CordzTest, SubCordFromSampledCord) { + CordzSamplingIntervalHelper sample_never{99999}; + Cord src = MakeAppendStringCord(TestCordSize::kLarge); + Cord cord = src.Subcord(10, src.size() / 2); + ASSERT_THAT(cord, HasValidCordzInfoOf(Method::kSubCord)); + CordzStatistics stats = GetCordzInfoForTesting(cord)->GetCordzStatistics(); + EXPECT_THAT(stats.parent_method, Eq(Method::kAppendString)); + EXPECT_THAT(stats.update_tracker.Value(Method::kAppendString), Eq(1)); +} + +TEST(CordzTest, SmallSubCord) { + CordzSamplingIntervalHelper sample_never{99999}; + Cord src = MakeAppendStringCord(TestCordSize::kLarge); + Cord cord = src.Subcord(10, kMaxInline + 1); + EXPECT_THAT(cord, HasValidCordzInfoOf(Method::kSubCord)); +} + +} // namespace + +ABSL_NAMESPACE_END +} // namespace absl + +#endif // ABSL_INTERNAL_CORDZ_ENABLED diff --git a/third_party/abseil-cpp/absl/strings/cordz_test_helpers.h b/third_party/abseil-cpp/absl/strings/cordz_test_helpers.h new file mode 100644 index 0000000000..e410eecf7f --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/cordz_test_helpers.h @@ -0,0 +1,151 @@ +// Copyright 2021 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ABSL_STRINGS_CORDZ_TEST_HELPERS_H_ +#define ABSL_STRINGS_CORDZ_TEST_HELPERS_H_ + +#include <utility> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/base/config.h" +#include "absl/base/macros.h" +#include "absl/strings/cord.h" +#include "absl/strings/internal/cord_internal.h" +#include "absl/strings/internal/cordz_info.h" +#include "absl/strings/internal/cordz_sample_token.h" +#include "absl/strings/internal/cordz_statistics.h" +#include "absl/strings/internal/cordz_update_tracker.h" +#include "absl/strings/str_cat.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN + +// Returns the CordzInfo for the cord, or nullptr if the cord is not sampled. +inline const cord_internal::CordzInfo* GetCordzInfoForTesting( + const Cord& cord) { + if (!cord.contents_.is_tree()) return nullptr; + return cord.contents_.cordz_info(); +} + +// Returns true if the provided cordz_info is in the list of sampled cords. +inline bool CordzInfoIsListed(const cord_internal::CordzInfo* cordz_info, + cord_internal::CordzSampleToken token = {}) { + for (const cord_internal::CordzInfo& info : token) { + if (cordz_info == &info) return true; + } + return false; +} + +// Matcher on Cord that verifies all of: +// - the cord is sampled +// - the CordzInfo of the cord is listed / discoverable. 
+// - the reported CordzStatistics match the cord's actual properties +// - the cord has an (initial) UpdateTracker count of 1 for `method` +MATCHER_P(HasValidCordzInfoOf, method, "CordzInfo matches cord") { + const cord_internal::CordzInfo* cord_info = GetCordzInfoForTesting(arg); + if (cord_info == nullptr) { + *result_listener << "cord is not sampled"; + return false; + } + if (!CordzInfoIsListed(cord_info)) { + *result_listener << "cord is sampled, but not listed"; + return false; + } + cord_internal::CordzStatistics stat = cord_info->GetCordzStatistics(); + if (stat.size != arg.size()) { + *result_listener << "cordz size " << stat.size + << " does not match cord size " << arg.size(); + return false; + } + if (stat.update_tracker.Value(method) != 1) { + *result_listener << "Expected method count 1 for " << method << ", found " + << stat.update_tracker.Value(method); + return false; + } + return true; +} + +// Matcher on Cord that verifies that the cord is sampled and that the CordzInfo +// update tracker has 'method' with a call count of 'n' +MATCHER_P2(CordzMethodCountEq, method, n, + absl::StrCat("CordzInfo method count equals ", n)) { + const cord_internal::CordzInfo* cord_info = GetCordzInfoForTesting(arg); + if (cord_info == nullptr) { + *result_listener << "cord is not sampled"; + return false; + } + cord_internal::CordzStatistics stat = cord_info->GetCordzStatistics(); + if (stat.update_tracker.Value(method) != n) { + *result_listener << "Expected method count " << n << " for " << method + << ", found " << stat.update_tracker.Value(method); + return false; + } + return true; +} + +// Cordz will only update with a new rate once the previously scheduled event +// has fired. When we disable Cordz, a long delay takes place where we won't +// consider profiling new Cords. CordzSampleIntervalHelper will burn through +// that interval and allow for testing that assumes that the average sampling +// interval is a particular value. 
+class CordzSamplingIntervalHelper { + public: + explicit CordzSamplingIntervalHelper(int32_t interval) + : orig_mean_interval_(absl::cord_internal::get_cordz_mean_interval()) { + absl::cord_internal::set_cordz_mean_interval(interval); + absl::cord_internal::cordz_set_next_sample_for_testing(interval); + } + + ~CordzSamplingIntervalHelper() { + absl::cord_internal::set_cordz_mean_interval(orig_mean_interval_); + absl::cord_internal::cordz_set_next_sample_for_testing(orig_mean_interval_); + } + + private: + int32_t orig_mean_interval_; +}; + +// Wrapper struct managing a small CordRep `rep` +struct TestCordRep { + cord_internal::CordRepFlat* rep; + + TestCordRep() { + rep = cord_internal::CordRepFlat::New(100); + rep->length = 100; + memset(rep->Data(), 1, 100); + } + ~TestCordRep() { cord_internal::CordRep::Unref(rep); } +}; + +// Wrapper struct managing a small CordRep `rep`, and +// an InlineData `data` initialized with that CordRep. +struct TestCordData { + TestCordRep rep; + cord_internal::InlineData data{rep.rep}; +}; + +// Creates a Cord that is not sampled +template <typename... Args> +Cord UnsampledCord(Args... args) { + CordzSamplingIntervalHelper never(9999); + Cord cord(std::forward<Args>(args)...); + ABSL_ASSERT(GetCordzInfoForTesting(cord) == nullptr); + return cord; +} + +ABSL_NAMESPACE_END +} // namespace absl + +#endif // ABSL_STRINGS_CORDZ_TEST_HELPERS_H_ diff --git a/third_party/abseil-cpp/absl/strings/internal/charconv_parse.cc b/third_party/abseil-cpp/absl/strings/internal/charconv_parse.cc index 8b11868c88..d29acaf462 100644 --- a/third_party/abseil-cpp/absl/strings/internal/charconv_parse.cc +++ b/third_party/abseil-cpp/absl/strings/internal/charconv_parse.cc @@ -52,7 +52,7 @@ static_assert(std::numeric_limits<double>::digits == 53, "IEEE double fact"); // The lowest valued 19-digit decimal mantissa we can read still contains // sufficient information to reconstruct a binary mantissa. 
-static_assert(1000000000000000000u > (uint64_t(1) << (53 + 3)), "(b) above"); +static_assert(1000000000000000000u > (uint64_t{1} << (53 + 3)), "(b) above"); // ParseFloat<16> will read the first 15 significant digits of the mantissa. // diff --git a/third_party/abseil-cpp/absl/strings/internal/cord_internal.h b/third_party/abseil-cpp/absl/strings/internal/cord_internal.h index a1ba67fec3..813b3f3527 100644 --- a/third_party/abseil-cpp/absl/strings/internal/cord_internal.h +++ b/third_party/abseil-cpp/absl/strings/internal/cord_internal.h @@ -329,18 +329,17 @@ static constexpr cordz_info_t BigEndianByte(unsigned char value) { class InlineData { public: + // DefaultInitType forces the use of the default initialization constructor. + enum DefaultInitType { kDefaultInit }; + // kNullCordzInfo holds the big endian representation of intptr_t(1) // This is the 'null' / initial value of 'cordz_info'. The null value // is specifically big endian 1 as with 64-bit pointers, the last // byte of cordz_info overlaps with the last byte holding the tag. static constexpr cordz_info_t kNullCordzInfo = BigEndianByte(1); - // kFakeCordzInfo holds a 'fake', non-null cordz-info value we use to - // emulate the previous 'kProfiled' tag logic in 'set_profiled' until - // cord code is changed to store cordz_info values in InlineData. - static constexpr cordz_info_t kFakeCordzInfo = BigEndianByte(9); - constexpr InlineData() : as_chars_{0} {} + explicit InlineData(DefaultInitType) {} explicit constexpr InlineData(CordRep* rep) : as_tree_(rep) {} explicit constexpr InlineData(absl::string_view chars) : as_chars_{ @@ -367,6 +366,16 @@ class InlineData { return as_tree_.cordz_info != kNullCordzInfo; } + // Returns true if either of the provided instances hold a cordz_info value. + // This method is more efficient than the equivalent `data1.is_profiled() || + // data2.is_profiled()`. Requires both arguments to hold a tree. 
+ static bool is_either_profiled(const InlineData& data1, + const InlineData& data2) { + assert(data1.is_tree() && data2.is_tree()); + return (data1.as_tree_.cordz_info | data2.as_tree_.cordz_info) != + kNullCordzInfo; + } + // Returns the cordz_info sampling instance for this instance, or nullptr // if the current instance is not sampled and does not have CordzInfo data. // Requires the current instance to hold a tree value. @@ -454,13 +463,6 @@ class InlineData { tag() = static_cast<char>(size << 1); } - // Sets or unsets the 'is_profiled' state of this instance. - // Requires the current instance to hold a tree value. - void set_profiled(bool profiled) { - assert(is_tree()); - as_tree_.cordz_info = profiled ? kFakeCordzInfo : kNullCordzInfo; - } - private: // See cordz_info_t for forced alignment and size of `cordz_info` details. struct AsTree { diff --git a/third_party/abseil-cpp/absl/strings/internal/cord_rep_ring.cc b/third_party/abseil-cpp/absl/strings/internal/cord_rep_ring.cc index 4d31d1d97c..f78c94e19b 100644 --- a/third_party/abseil-cpp/absl/strings/internal/cord_rep_ring.cc +++ b/third_party/abseil-cpp/absl/strings/internal/cord_rep_ring.cc @@ -32,15 +32,6 @@ namespace absl { ABSL_NAMESPACE_BEGIN namespace cord_internal { -// See https://bugs.llvm.org/show_bug.cgi?id=48477 -#ifdef __clang__ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wshadow" -#if __has_warning("-Wshadow-field") -#pragma clang diagnostic ignored "-Wshadow-field" -#endif -#endif - namespace { using index_type = CordRepRing::index_type; @@ -301,7 +292,7 @@ bool CordRepRing::IsValid(std::ostream& output) const { if (offset >= child->length || entry_length > child->length - offset) { output << "entry[" << head << "] has offset " << offset << " and entry length " << entry_length - << " which are outside of the childs length of " << child->length; + << " which are outside of the child's length of " << child->length; return false; } @@ -400,10 +391,11 @@ CordRepRing* 
CordRepRing::Mutable(CordRepRing* rep, size_t extra) { // Get current number of entries, and check for max capacity. size_t entries = rep->entries(); - size_t min_extra = (std::max)(extra, rep->capacity() * 2 - entries); if (!rep->refcount.IsOne()) { - return Copy(rep, rep->head(), rep->tail(), min_extra); + return Copy(rep, rep->head(), rep->tail(), extra); } else if (entries + extra > rep->capacity()) { + const size_t min_grow = rep->capacity() + rep->capacity() / 2; + const size_t min_extra = (std::max)(extra, min_grow - entries); CordRepRing* newrep = CordRepRing::New(entries, min_extra); newrep->Fill<false>(rep, rep->head(), rep->tail()); CordRepRing::Delete(rep); @@ -449,12 +441,12 @@ Span<char> CordRepRing::GetPrependBuffer(size_t size) { } CordRepRing* CordRepRing::CreateFromLeaf(CordRep* child, size_t offset, - size_t length, size_t extra) { + size_t len, size_t extra) { CordRepRing* rep = CordRepRing::New(1, extra); rep->head_ = 0; rep->tail_ = rep->advance(0); - rep->length = length; - rep->entry_end_pos()[0] = length; + rep->length = len; + rep->entry_end_pos()[0] = len; rep->entry_child()[0] = child; rep->entry_data_offset()[0] = static_cast<offset_type>(offset); return Validate(rep); @@ -462,16 +454,16 @@ CordRepRing* CordRepRing::CreateFromLeaf(CordRep* child, size_t offset, CordRepRing* CordRepRing::CreateSlow(CordRep* child, size_t extra) { CordRepRing* rep = nullptr; - Consume(child, [&](CordRep* child, size_t offset, size_t length) { - if (IsFlatOrExternal(child)) { - rep = rep ? AppendLeaf(rep, child, offset, length) - : CreateFromLeaf(child, offset, length, extra); + Consume(child, [&](CordRep* child_arg, size_t offset, size_t len) { + if (IsFlatOrExternal(child_arg)) { + rep = rep ? 
AppendLeaf(rep, child_arg, offset, len) + : CreateFromLeaf(child_arg, offset, len, extra); } else if (rep) { - rep = AddRing<AddMode::kAppend>(rep, child->ring(), offset, length); - } else if (offset == 0 && child->length == length) { - rep = Mutable(child->ring(), extra); + rep = AddRing<AddMode::kAppend>(rep, child_arg->ring(), offset, len); + } else if (offset == 0 && child_arg->length == len) { + rep = Mutable(child_arg->ring(), extra); } else { - rep = SubRing(child->ring(), offset, length, extra); + rep = SubRing(child_arg->ring(), offset, len, extra); } }); return Validate(rep, nullptr, __LINE__); @@ -490,18 +482,18 @@ CordRepRing* CordRepRing::Create(CordRep* child, size_t extra) { template <CordRepRing::AddMode mode> CordRepRing* CordRepRing::AddRing(CordRepRing* rep, CordRepRing* ring, - size_t offset, size_t length) { + size_t offset, size_t len) { assert(offset < ring->length); constexpr bool append = mode == AddMode::kAppend; Position head = ring->Find(offset); - Position tail = ring->FindTail(head.index, offset + length); + Position tail = ring->FindTail(head.index, offset + len); const index_type entries = ring->entries(head.index, tail.index); rep = Mutable(rep, entries); // The delta for making ring[head].end_pos into 'len - offset' const pos_type delta_length = - (append ? rep->begin_pos_ + rep->length : rep->begin_pos_ - length) - + (append ? 
rep->begin_pos_ + rep->length : rep->begin_pos_ - len) - ring->entry_begin_pos(head.index) - head.offset; // Start filling at `tail`, or `entries` before `head` @@ -542,36 +534,36 @@ CordRepRing* CordRepRing::AddRing(CordRepRing* rep, CordRepRing* ring, } // Commit changes - rep->length += length; + rep->length += len; if (append) { rep->tail_ = filler.pos(); } else { rep->head_ = filler.head(); - rep->begin_pos_ -= length; + rep->begin_pos_ -= len; } return Validate(rep); } CordRepRing* CordRepRing::AppendSlow(CordRepRing* rep, CordRep* child) { - Consume(child, [&rep](CordRep* child, size_t offset, size_t length) { - if (child->tag == RING) { - rep = AddRing<AddMode::kAppend>(rep, child->ring(), offset, length); + Consume(child, [&rep](CordRep* child_arg, size_t offset, size_t len) { + if (child_arg->tag == RING) { + rep = AddRing<AddMode::kAppend>(rep, child_arg->ring(), offset, len); } else { - rep = AppendLeaf(rep, child, offset, length); + rep = AppendLeaf(rep, child_arg, offset, len); } }); return rep; } CordRepRing* CordRepRing::AppendLeaf(CordRepRing* rep, CordRep* child, - size_t offset, size_t length) { + size_t offset, size_t len) { rep = Mutable(rep, 1); index_type back = rep->tail_; const pos_type begin_pos = rep->begin_pos_ + rep->length; rep->tail_ = rep->advance(rep->tail_); - rep->length += length; - rep->entry_end_pos()[back] = begin_pos + length; + rep->length += len; + rep->entry_end_pos()[back] = begin_pos + len; rep->entry_child()[back] = child; rep->entry_data_offset()[back] = static_cast<offset_type>(offset); return Validate(rep, nullptr, __LINE__); @@ -589,24 +581,24 @@ CordRepRing* CordRepRing::Append(CordRepRing* rep, CordRep* child) { } CordRepRing* CordRepRing::PrependSlow(CordRepRing* rep, CordRep* child) { - RConsume(child, [&](CordRep* child, size_t offset, size_t length) { - if (IsFlatOrExternal(child)) { - rep = PrependLeaf(rep, child, offset, length); + RConsume(child, [&](CordRep* child_arg, size_t offset, size_t len) { + if 
(IsFlatOrExternal(child_arg)) { + rep = PrependLeaf(rep, child_arg, offset, len); } else { - rep = AddRing<AddMode::kPrepend>(rep, child->ring(), offset, length); + rep = AddRing<AddMode::kPrepend>(rep, child_arg->ring(), offset, len); } }); return Validate(rep); } CordRepRing* CordRepRing::PrependLeaf(CordRepRing* rep, CordRep* child, - size_t offset, size_t length) { + size_t offset, size_t len) { rep = Mutable(rep, 1); index_type head = rep->retreat(rep->head_); pos_type end_pos = rep->begin_pos_; rep->head_ = head; - rep->length += length; - rep->begin_pos_ -= length; + rep->length += len; + rep->begin_pos_ -= len; rep->entry_end_pos()[head] = end_pos; rep->entry_child()[head] = child; rep->entry_data_offset()[head] = static_cast<offset_type>(offset); @@ -786,18 +778,18 @@ char CordRepRing::GetCharacter(size_t offset) const { } CordRepRing* CordRepRing::SubRing(CordRepRing* rep, size_t offset, - size_t length, size_t extra) { + size_t len, size_t extra) { assert(offset <= rep->length); - assert(offset <= rep->length - length); + assert(offset <= rep->length - len); - if (length == 0) { + if (len == 0) { CordRep::Unref(rep); return nullptr; } // Find position of first byte Position head = rep->Find(offset); - Position tail = rep->FindTail(head.index, offset + length); + Position tail = rep->FindTail(head.index, offset + len); const size_t new_entries = rep->entries(head.index, tail.index); if (rep->refcount.IsOne() && extra <= (rep->capacity() - new_entries)) { @@ -814,7 +806,7 @@ CordRepRing* CordRepRing::SubRing(CordRepRing* rep, size_t offset, } // Adjust begin_pos and length - rep->length = length; + rep->length = len; rep->begin_pos_ += offset; // Adjust head and tail blocks @@ -888,10 +880,6 @@ CordRepRing* CordRepRing::RemoveSuffix(CordRepRing* rep, size_t len, return Validate(rep); } -#ifdef __clang__ -#pragma clang diagnostic pop -#endif - } // namespace cord_internal ABSL_NAMESPACE_END } // namespace absl diff --git 
a/third_party/abseil-cpp/absl/strings/internal/cord_rep_ring.h b/third_party/abseil-cpp/absl/strings/internal/cord_rep_ring.h index c74d3353ff..2082a5653f 100644 --- a/third_party/abseil-cpp/absl/strings/internal/cord_rep_ring.h +++ b/third_party/abseil-cpp/absl/strings/internal/cord_rep_ring.h @@ -30,15 +30,6 @@ namespace absl { ABSL_NAMESPACE_BEGIN namespace cord_internal { -// See https://bugs.llvm.org/show_bug.cgi?id=48477 -#ifdef __clang__ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wshadow" -#if __has_warning("-Wshadow-field") -#pragma clang diagnostic ignored "-Wshadow-field" -#endif -#endif - // All operations modifying a ring buffer are implemented as static methods // requiring a CordRepRing instance with a reference adopted by the method. // @@ -210,23 +201,23 @@ class CordRepRing : public CordRep { // referencing up to `size` capacity directly before the existing data. Span<char> GetPrependBuffer(size_t size); - // Returns a cord ring buffer containing `length` bytes of data starting at + // Returns a cord ring buffer containing `len` bytes of data starting at // `offset`. If the input is not shared, this function will remove all head // and tail child nodes outside of the requested range, and adjust the new // head and tail nodes as required. If the input is shared, this function // returns a new instance sharing some or all of the nodes from the input. - static CordRepRing* SubRing(CordRepRing* r, size_t offset, size_t length, + static CordRepRing* SubRing(CordRepRing* r, size_t offset, size_t len, size_t extra = 0); - // Returns a cord ring buffer with the first `length` bytes removed. + // Returns a cord ring buffer with the first `len` bytes removed. // If the input is not shared, this function will remove all head child nodes // fully inside the first `length` bytes, and adjust the new head as required. // If the input is shared, this function returns a new instance sharing some // or all of the nodes from the input. 
- static CordRepRing* RemoveSuffix(CordRepRing* r, size_t length, + static CordRepRing* RemoveSuffix(CordRepRing* r, size_t len, size_t extra = 0); - // Returns a cord ring buffer with the last `length` bytes removed. + // Returns a cord ring buffer with the last `len` bytes removed. // If the input is not shared, this function will remove all head child nodes // fully inside the first `length` bytes, and adjust the new head as required. // If the input is shared, this function returns a new instance sharing some @@ -237,6 +228,18 @@ class CordRepRing : public CordRep { // Returns the character at `offset`. Requires that `offset < length`. char GetCharacter(size_t offset) const; + // Returns true if this instance manages a single contiguous buffer, in which + // case the (optional) output parameter `fragment` is set. Otherwise, the + // function returns false, and `fragment` is left unchanged. + bool IsFlat(absl::string_view* fragment) const; + + // Returns true if the data starting at `offset` with length `len` is + // managed by this instance inside a single contiguous buffer, in which case + // the (optional) output parameter `fragment` is set to the contiguous memory + // starting at offset `offset` with length `length`. Otherwise, the function + // returns false, and `fragment` is left unchanged. + bool IsFlat(size_t offset, size_t len, absl::string_view* fragment) const; + // Testing only: set capacity to requested capacity. void SetCapacityForTesting(size_t capacity); @@ -461,10 +464,10 @@ class CordRepRing : public CordRep { size_t length, size_t extra); // Appends or prepends (depending on AddMode) the ring buffer in `ring' to - // `rep` starting at `offset` with length `length`. + // `rep` starting at `offset` with length `len`. template <AddMode mode> static CordRepRing* AddRing(CordRepRing* rep, CordRepRing* ring, - size_t offset, size_t length); + size_t offset, size_t len); // Increases the data offset for entry `index` by `n`. 
void AddDataOffset(index_type index, size_t n); @@ -576,11 +579,26 @@ inline const CordRepRing* CordRep::ring() const { return static_cast<const CordRepRing*>(this); } -std::ostream& operator<<(std::ostream& s, const CordRepRing& rep); +inline bool CordRepRing::IsFlat(absl::string_view* fragment) const { + if (entries() == 1) { + if (fragment) *fragment = entry_data(head()); + return true; + } + return false; +} -#ifdef __clang__ -#pragma clang diagnostic pop -#endif +inline bool CordRepRing::IsFlat(size_t offset, size_t len, + absl::string_view* fragment) const { + const Position pos = Find(offset); + const absl::string_view data = entry_data(pos.index); + if (data.length() >= len && data.length() - len >= pos.offset) { + if (fragment) *fragment = data.substr(pos.offset, len); + return true; + } + return false; +} + +std::ostream& operator<<(std::ostream& s, const CordRepRing& rep); } // namespace cord_internal ABSL_NAMESPACE_END diff --git a/third_party/abseil-cpp/absl/strings/internal/cord_rep_ring_reader.h b/third_party/abseil-cpp/absl/strings/internal/cord_rep_ring_reader.h index 396c0e2cd8..7ceeaa000e 100644 --- a/third_party/abseil-cpp/absl/strings/internal/cord_rep_ring_reader.h +++ b/third_party/abseil-cpp/absl/strings/internal/cord_rep_ring_reader.h @@ -40,6 +40,10 @@ class CordRepRingReader { // The returned value is undefined if this instance is empty. CordRepRing::index_type index() const { return index_; } + // Returns the current node inside the ring buffer for this instance. + // The returned value is undefined if this instance is empty. + CordRep* node() const { return ring_->entry_child(index_); } + // Returns the length of the referenced ring buffer. // Requires the current instance to be non empty. 
size_t length() const { diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_functions.cc b/third_party/abseil-cpp/absl/strings/internal/cordz_functions.cc new file mode 100644 index 0000000000..f30080f8c2 --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_functions.cc @@ -0,0 +1,110 @@ +// Copyright 2019 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "absl/strings/internal/cordz_functions.h" + +#include <atomic> +#include <cmath> +#include <limits> +#include <random> + +#include "absl/base/attributes.h" +#include "absl/base/config.h" +#include "absl/base/internal/exponential_biased.h" +#include "absl/base/internal/raw_logging.h" + +// TODO(b/162942788): weak 'cordz_disabled' value. +// A strong version is in the 'cordz_disabled_hack_for_odr' library which can +// be linked in to disable cordz at compile time. +extern "C" { +bool absl_internal_cordz_disabled ABSL_ATTRIBUTE_WEAK = false; +} + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { +namespace { + +// The average interval until the next sample. A value of 0 disables profiling +// while a value of 1 will profile all Cords. +std::atomic<int> g_cordz_mean_interval(50000); + +} // namespace + +#ifdef ABSL_INTERNAL_CORDZ_ENABLED + +// Special negative 'not initialized' per thread value for cordz_next_sample. 
+static constexpr int64_t kInitCordzNextSample = -1; + +ABSL_CONST_INIT thread_local int64_t cordz_next_sample = kInitCordzNextSample; + +// kIntervalIfDisabled is the number of profile-eligible events need to occur +// before the code will confirm that cordz is still disabled. +constexpr int64_t kIntervalIfDisabled = 1 << 16; + +ABSL_ATTRIBUTE_NOINLINE bool cordz_should_profile_slow() { + // TODO(b/162942788): check if profiling is disabled at compile time. + if (absl_internal_cordz_disabled) { + ABSL_RAW_LOG(WARNING, "Cordz info disabled at compile time"); + // We are permanently disabled: set counter to highest possible value. + cordz_next_sample = std::numeric_limits<int64_t>::max(); + return false; + } + + thread_local absl::base_internal::ExponentialBiased + exponential_biased_generator; + int32_t mean_interval = get_cordz_mean_interval(); + + // Check if we disabled profiling. If so, set the next sample to a "large" + // number to minimize the overhead of the should_profile codepath. + if (mean_interval <= 0) { + cordz_next_sample = kIntervalIfDisabled; + return false; + } + + // Check if we're always sampling. + if (mean_interval == 1) { + cordz_next_sample = 1; + return true; + } + + if (cordz_next_sample <= 0) { + // If first check on current thread, check cordz_should_profile() + // again using the created (initial) stride in cordz_next_sample. 
+ const bool initialized = cordz_next_sample != kInitCordzNextSample; + cordz_next_sample = exponential_biased_generator.GetStride(mean_interval); + return initialized || cordz_should_profile(); + } + + --cordz_next_sample; + return false; +} + +void cordz_set_next_sample_for_testing(int64_t next_sample) { + cordz_next_sample = next_sample; +} + +#endif // ABSL_INTERNAL_CORDZ_ENABLED + +int32_t get_cordz_mean_interval() { + return g_cordz_mean_interval.load(std::memory_order_acquire); +} + +void set_cordz_mean_interval(int32_t mean_interval) { + g_cordz_mean_interval.store(mean_interval, std::memory_order_release); +} + +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_functions.h b/third_party/abseil-cpp/absl/strings/internal/cordz_functions.h new file mode 100644 index 0000000000..c9ba14508a --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_functions.h @@ -0,0 +1,85 @@ +// Copyright 2019 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ABSL_STRINGS_CORDZ_FUNCTIONS_H_ +#define ABSL_STRINGS_CORDZ_FUNCTIONS_H_ + +#include <stdint.h> + +#include "absl/base/attributes.h" +#include "absl/base/config.h" +#include "absl/base/optimization.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { + +// Returns the current sample rate. This represents the average interval +// between samples. 
+int32_t get_cordz_mean_interval(); + +// Sets the sample rate with the average interval between samples. +void set_cordz_mean_interval(int32_t mean_interval); + +// Enable cordz unless any of the following applies: +// - no thread local support +// - MSVC build +// - Android build +// - Apple build +// - DLL build +// Hashtablez is turned off completely in opensource builds. +// MSVC's static atomics are dynamically initialized in debug mode, which breaks +// sampling. +#if defined(ABSL_HAVE_THREAD_LOCAL) && !defined(_MSC_VER) && \ + !defined(ABSL_BUILD_DLL) && !defined(ABSL_CONSUME_DLL) && \ + !defined(__ANDROID__) && !defined(__APPLE__) +#define ABSL_INTERNAL_CORDZ_ENABLED 1 +#endif + +#ifdef ABSL_INTERNAL_CORDZ_ENABLED + +// cordz_next_sample is the number of events until the next sample event. If +// the value is 1 or less, the code will check on the next event if cordz is +// enabled, and if so, will sample the Cord. cordz is only enabled when we can +// use thread locals. +ABSL_CONST_INIT extern thread_local int64_t cordz_next_sample; + +// Determines if the next sample should be profiled. If it is, the value pointed +// at by next_sample will be set with the interval until the next sample. +bool cordz_should_profile_slow(); + +// Returns true if the next cord should be sampled. 
+inline bool cordz_should_profile() { + if (ABSL_PREDICT_TRUE(cordz_next_sample > 1)) { + cordz_next_sample--; + return false; + } + return cordz_should_profile_slow(); +} + +// Sets the interval until the next sample (for testing only) +void cordz_set_next_sample_for_testing(int64_t next_sample); + +#else // ABSL_INTERNAL_CORDZ_ENABLED + +inline bool cordz_should_profile() { return false; } +inline void cordz_set_next_sample_for_testing(int64_t) {} + +#endif // ABSL_INTERNAL_CORDZ_ENABLED + +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl + +#endif // ABSL_STRINGS_CORDZ_FUNCTIONS_H_ diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_functions_test.cc b/third_party/abseil-cpp/absl/strings/internal/cordz_functions_test.cc new file mode 100644 index 0000000000..350623c1f3 --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_functions_test.cc @@ -0,0 +1,149 @@ +// Copyright 2019 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "absl/strings/internal/cordz_functions.h" + +#include <thread> // NOLINT we need real clean new threads + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/base/config.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { +namespace { + +using ::testing::Eq; +using ::testing::Ge; +using ::testing::Le; + +TEST(CordzFunctionsTest, SampleRate) { + int32_t orig_sample_rate = get_cordz_mean_interval(); + int32_t expected_sample_rate = 123; + set_cordz_mean_interval(expected_sample_rate); + EXPECT_THAT(get_cordz_mean_interval(), Eq(expected_sample_rate)); + set_cordz_mean_interval(orig_sample_rate); +} + +// Cordz is disabled when we don't have thread_local. All calls to +// should_profile will return false when cordz is diabled, so we might want to +// avoid those tests. +#ifdef ABSL_INTERNAL_CORDZ_ENABLED + +TEST(CordzFunctionsTest, ShouldProfileDisable) { + int32_t orig_sample_rate = get_cordz_mean_interval(); + + set_cordz_mean_interval(0); + cordz_set_next_sample_for_testing(0); + EXPECT_FALSE(cordz_should_profile()); + // 1 << 16 is from kIntervalIfDisabled in cordz_functions.cc. + EXPECT_THAT(cordz_next_sample, Eq(1 << 16)); + + set_cordz_mean_interval(orig_sample_rate); +} + +TEST(CordzFunctionsTest, ShouldProfileAlways) { + int32_t orig_sample_rate = get_cordz_mean_interval(); + + set_cordz_mean_interval(1); + cordz_set_next_sample_for_testing(1); + EXPECT_TRUE(cordz_should_profile()); + EXPECT_THAT(cordz_next_sample, Le(1)); + + set_cordz_mean_interval(orig_sample_rate); +} + +TEST(CordzFunctionsTest, DoesNotAlwaysSampleFirstCord) { + // Set large enough interval such that the chance of 'tons' of threads + // randomly sampling the first call is infinitely small. 
+ set_cordz_mean_interval(10000); + int tries = 0; + bool sampled = false; + do { + ++tries; + ASSERT_THAT(tries, Le(1000)); + std::thread thread([&sampled] { + sampled = cordz_should_profile(); + }); + thread.join(); + } while (sampled); +} + +TEST(CordzFunctionsTest, ShouldProfileRate) { + static constexpr int kDesiredMeanInterval = 1000; + static constexpr int kSamples = 10000; + int32_t orig_sample_rate = get_cordz_mean_interval(); + + set_cordz_mean_interval(kDesiredMeanInterval); + + int64_t sum_of_intervals = 0; + for (int i = 0; i < kSamples; i++) { + // Setting next_sample to 0 will force cordz_should_profile to generate a + // new value for next_sample each iteration. + cordz_set_next_sample_for_testing(0); + cordz_should_profile(); + sum_of_intervals += cordz_next_sample; + } + + // The sum of independent exponential variables is an Erlang distribution, + // which is a gamma distribution where the shape parameter is equal to the + // number of summands. The distribution used for cordz_should_profile is + // actually floor(Exponential(1/mean)) which introduces bias. However, we can + // apply the squint-really-hard correction factor. That is, when mean is + // large, then if we squint really hard the shape of the distribution between + // N and N+1 looks like a uniform distribution. On average, each value for + // next_sample will be about 0.5 lower than we would expect from an + // exponential distribution. This squint-really-hard correction approach won't + // work when mean is smaller than about 10 but works fine when mean is 1000. + // + // We can use R to calculate a confidence interval. This + // shows how to generate a confidence interval with a false positive rate of + // one in a billion. 
+ // + // $ R -q + // > mean = 1000 + // > kSamples = 10000 + // > errorRate = 1e-9 + // > correction = -kSamples / 2 + // > low = qgamma(errorRate/2, kSamples, 1/mean) + correction + // > high = qgamma(1 - errorRate/2, kSamples, 1/mean) + correction + // > low + // [1] 9396115 + // > high + // [1] 10618100 + EXPECT_THAT(sum_of_intervals, Ge(9396115)); + EXPECT_THAT(sum_of_intervals, Le(10618100)); + + set_cordz_mean_interval(orig_sample_rate); +} + +#else // ABSL_INTERNAL_CORDZ_ENABLED + +TEST(CordzFunctionsTest, ShouldProfileDisabled) { + int32_t orig_sample_rate = get_cordz_mean_interval(); + + set_cordz_mean_interval(1); + cordz_set_next_sample_for_testing(0); + EXPECT_FALSE(cordz_should_profile()); + + set_cordz_mean_interval(orig_sample_rate); +} + +#endif // ABSL_INTERNAL_CORDZ_ENABLED + +} // namespace +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_handle.cc b/third_party/abseil-cpp/absl/strings/internal/cordz_handle.cc new file mode 100644 index 0000000000..a73fefed59 --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_handle.cc @@ -0,0 +1,139 @@ +// Copyright 2019 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "absl/strings/internal/cordz_handle.h" + +#include <atomic> + +#include "absl/base/internal/raw_logging.h" // For ABSL_RAW_CHECK +#include "absl/base/internal/spinlock.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { + +using ::absl::base_internal::SpinLockHolder; + +ABSL_CONST_INIT CordzHandle::Queue CordzHandle::global_queue_(absl::kConstInit); + +CordzHandle::CordzHandle(bool is_snapshot) : is_snapshot_(is_snapshot) { + if (is_snapshot) { + SpinLockHolder lock(&queue_->mutex); + CordzHandle* dq_tail = queue_->dq_tail.load(std::memory_order_acquire); + if (dq_tail != nullptr) { + dq_prev_ = dq_tail; + dq_tail->dq_next_ = this; + } + queue_->dq_tail.store(this, std::memory_order_release); + } +} + +CordzHandle::~CordzHandle() { + ODRCheck(); + if (is_snapshot_) { + std::vector<CordzHandle*> to_delete; + { + SpinLockHolder lock(&queue_->mutex); + CordzHandle* next = dq_next_; + if (dq_prev_ == nullptr) { + // We were head of the queue, delete every CordzHandle until we reach + // either the end of the list, or a snapshot handle. + while (next && !next->is_snapshot_) { + to_delete.push_back(next); + next = next->dq_next_; + } + } else { + // Another CordzHandle existed before this one, don't delete anything. 
+ dq_prev_->dq_next_ = next; + } + if (next) { + next->dq_prev_ = dq_prev_; + } else { + queue_->dq_tail.store(dq_prev_, std::memory_order_release); + } + } + for (CordzHandle* handle : to_delete) { + delete handle; + } + } +} + +bool CordzHandle::SafeToDelete() const { + return is_snapshot_ || queue_->IsEmpty(); +} + +void CordzHandle::Delete(CordzHandle* handle) { + assert(handle); + if (handle) { + handle->ODRCheck(); + Queue* const queue = handle->queue_; + if (!handle->SafeToDelete()) { + SpinLockHolder lock(&queue->mutex); + CordzHandle* dq_tail = queue->dq_tail.load(std::memory_order_acquire); + if (dq_tail != nullptr) { + handle->dq_prev_ = dq_tail; + dq_tail->dq_next_ = handle; + queue->dq_tail.store(handle, std::memory_order_release); + return; + } + } + delete handle; + } +} + +std::vector<const CordzHandle*> CordzHandle::DiagnosticsGetDeleteQueue() { + std::vector<const CordzHandle*> handles; + SpinLockHolder lock(&global_queue_.mutex); + CordzHandle* dq_tail = global_queue_.dq_tail.load(std::memory_order_acquire); + for (const CordzHandle* p = dq_tail; p; p = p->dq_prev_) { + handles.push_back(p); + } + return handles; +} + +bool CordzHandle::DiagnosticsHandleIsSafeToInspect( + const CordzHandle* handle) const { + ODRCheck(); + if (!is_snapshot_) return false; + if (handle == nullptr) return true; + if (handle->is_snapshot_) return false; + bool snapshot_found = false; + SpinLockHolder lock(&queue_->mutex); + for (const CordzHandle* p = queue_->dq_tail; p; p = p->dq_prev_) { + if (p == handle) return !snapshot_found; + if (p == this) snapshot_found = true; + } + ABSL_ASSERT(snapshot_found); // Assert that 'this' is in delete queue. 
+ return true; +} + +std::vector<const CordzHandle*> +CordzHandle::DiagnosticsGetSafeToInspectDeletedHandles() { + ODRCheck(); + std::vector<const CordzHandle*> handles; + if (!is_snapshot()) { + return handles; + } + + SpinLockHolder lock(&queue_->mutex); + for (const CordzHandle* p = dq_next_; p != nullptr; p = p->dq_next_) { + if (!p->is_snapshot()) { + handles.push_back(p); + } + } + return handles; +} + +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_handle.h b/third_party/abseil-cpp/absl/strings/internal/cordz_handle.h new file mode 100644 index 0000000000..5df53c782a --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_handle.h @@ -0,0 +1,131 @@ +// Copyright 2019 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ABSL_STRINGS_CORDZ_HANDLE_H_ +#define ABSL_STRINGS_CORDZ_HANDLE_H_ + +#include <atomic> +#include <vector> + +#include "absl/base/config.h" +#include "absl/base/internal/raw_logging.h" +#include "absl/base/internal/spinlock.h" +#include "absl/synchronization/mutex.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { + +// This base class allows multiple types of object (CordzInfo and +// CordzSampleToken) to exist simultaneously on the delete queue (pointed to by +// global_dq_tail and traversed using dq_prev_ and dq_next_). 
The +// delete queue guarantees that once a profiler creates a CordzSampleToken and +// has gained visibility into a CordzInfo object, that CordzInfo object will not +// be deleted prematurely. This allows the profiler to inspect all CordzInfo +// objects that are alive without needing to hold a global lock. +class CordzHandle { + public: + CordzHandle() : CordzHandle(false) {} + + bool is_snapshot() const { return is_snapshot_; } + + // Returns true if this instance is safe to be deleted because it is either a + // snapshot, which is always safe to delete, or not included in the global + // delete queue and thus not included in any snapshot. + // Callers are responsible for making sure this instance can not be newly + // discovered by other threads. For example, CordzInfo instances first de-list + // themselves from the global CordzInfo list before determining if they are + // safe to be deleted directly. + // If SafeToDelete returns false, callers MUST use the Delete() method to + // safely queue CordzHandle instances for deletion. + bool SafeToDelete() const; + + // Deletes the provided instance, or puts it on the delete queue to be deleted + // once there are no more sample tokens (snapshot) instances potentially + // referencing the instance. `handle` should not be null. + static void Delete(CordzHandle* handle); + + // Returns the current entries in the delete queue in LIFO order. + static std::vector<const CordzHandle*> DiagnosticsGetDeleteQueue(); + + // Returns true if the provided handle is nullptr or guarded by this handle. + // Since the CordzSnapshot token is itself a CordzHandle, this method will + // allow tests to check if that token is keeping an arbitrary CordzHandle + // alive. + bool DiagnosticsHandleIsSafeToInspect(const CordzHandle* handle) const; + + // Returns the current entries in the delete queue, in LIFO order, that are + // protected by this. 
CordzHandle objects are only placed on the delete queue + // after CordzHandle::Delete is called with them as an argument. Only + // CordzHandle objects that are not also CordzSnapshot objects will be + // included in the return vector. For each of the handles in the return + // vector, the earliest that their memory can be freed is when this + // CordzSnapshot object is deleted. + std::vector<const CordzHandle*> DiagnosticsGetSafeToInspectDeletedHandles(); + + protected: + explicit CordzHandle(bool is_snapshot); + virtual ~CordzHandle(); + + private: + // Global queue data. CordzHandle stores a pointer to the global queue + // instance to harden against ODR violations. + struct Queue { + constexpr explicit Queue(absl::ConstInitType) + : mutex(absl::kConstInit, + absl::base_internal::SCHEDULE_COOPERATIVE_AND_KERNEL) {} + + absl::base_internal::SpinLock mutex; + std::atomic<CordzHandle*> dq_tail ABSL_GUARDED_BY(mutex){nullptr}; + + // Returns true if this delete queue is empty. This method does not acquire + // the lock, but does a 'load acquire' observation on the delete queue tail. + // It is used inside Delete() to check for the presence of a delete queue + // without holding the lock. The assumption is that the caller is in the + // state of 'being deleted', and can not be newly discovered by a concurrent + // 'being constructed' snapshot instance. Practically, this means that any + // such discovery (`find`, 'first' or 'next', etc) must have proper 'happens + // before / after' semantics and atomic fences. + bool IsEmpty() const ABSL_NO_THREAD_SAFETY_ANALYSIS { + return dq_tail.load(std::memory_order_acquire) == nullptr; + } + }; + + void ODRCheck() const { +#ifndef NDEBUG + ABSL_RAW_CHECK(queue_ == &global_queue_, "ODR violation in Cord"); +#endif + } + + ABSL_CONST_INIT static Queue global_queue_; + Queue* const queue_ = &global_queue_; + const bool is_snapshot_; + + // dq_prev_ and dq_next_ require the global queue mutex to be held. 
+ // Unfortunately we can't use thread annotations such that the thread safety + // analysis understands that queue_ and global_queue_ are one and the same. + CordzHandle* dq_prev_ = nullptr; + CordzHandle* dq_next_ = nullptr; +}; + +class CordzSnapshot : public CordzHandle { + public: + CordzSnapshot() : CordzHandle(true) {} +}; + +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl + +#endif // ABSL_STRINGS_CORDZ_HANDLE_H_ diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_handle_test.cc b/third_party/abseil-cpp/absl/strings/internal/cordz_handle_test.cc new file mode 100644 index 0000000000..fd68e06b3e --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_handle_test.cc @@ -0,0 +1,265 @@ +// Copyright 2019 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "absl/strings/internal/cordz_handle.h" + +#include <random> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/memory/memory.h" +#include "absl/synchronization/internal/thread_pool.h" +#include "absl/synchronization/notification.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { +namespace { + +using ::testing::ElementsAre; +using ::testing::Gt; +using ::testing::IsEmpty; +using ::testing::SizeIs; + +// Local less verbose helper +std::vector<const CordzHandle*> DeleteQueue() { + return CordzHandle::DiagnosticsGetDeleteQueue(); +} + +struct CordzHandleDeleteTracker : public CordzHandle { + bool* deleted; + explicit CordzHandleDeleteTracker(bool* deleted) : deleted(deleted) {} + ~CordzHandleDeleteTracker() override { *deleted = true; } +}; + +TEST(CordzHandleTest, DeleteQueueIsEmpty) { + EXPECT_THAT(DeleteQueue(), SizeIs(0)); +} + +TEST(CordzHandleTest, CordzHandleCreateDelete) { + bool deleted = false; + auto* handle = new CordzHandleDeleteTracker(&deleted); + EXPECT_FALSE(handle->is_snapshot()); + EXPECT_TRUE(handle->SafeToDelete()); + EXPECT_THAT(DeleteQueue(), SizeIs(0)); + + CordzHandle::Delete(handle); + EXPECT_THAT(DeleteQueue(), SizeIs(0)); + EXPECT_TRUE(deleted); +} + +TEST(CordzHandleTest, CordzSnapshotCreateDelete) { + auto* snapshot = new CordzSnapshot(); + EXPECT_TRUE(snapshot->is_snapshot()); + EXPECT_TRUE(snapshot->SafeToDelete()); + EXPECT_THAT(DeleteQueue(), ElementsAre(snapshot)); + delete snapshot; + EXPECT_THAT(DeleteQueue(), SizeIs(0)); +} + +TEST(CordzHandleTest, CordzHandleCreateDeleteWithSnapshot) { + bool deleted = false; + auto* snapshot = new CordzSnapshot(); + auto* handle = new CordzHandleDeleteTracker(&deleted); + EXPECT_FALSE(handle->SafeToDelete()); + + CordzHandle::Delete(handle); + EXPECT_THAT(DeleteQueue(), ElementsAre(handle, snapshot)); + EXPECT_FALSE(deleted); + EXPECT_FALSE(handle->SafeToDelete()); + + delete snapshot; + 
EXPECT_THAT(DeleteQueue(), SizeIs(0)); + EXPECT_TRUE(deleted); +} + +TEST(CordzHandleTest, MultiSnapshot) { + bool deleted[3] = {false, false, false}; + + CordzSnapshot* snapshot[3]; + CordzHandleDeleteTracker* handle[3]; + for (int i = 0; i < 3; ++i) { + snapshot[i] = new CordzSnapshot(); + handle[i] = new CordzHandleDeleteTracker(&deleted[i]); + CordzHandle::Delete(handle[i]); + } + + EXPECT_THAT(DeleteQueue(), ElementsAre(handle[2], snapshot[2], handle[1], + snapshot[1], handle[0], snapshot[0])); + EXPECT_THAT(deleted, ElementsAre(false, false, false)); + + delete snapshot[1]; + EXPECT_THAT(DeleteQueue(), ElementsAre(handle[2], snapshot[2], handle[1], + handle[0], snapshot[0])); + EXPECT_THAT(deleted, ElementsAre(false, false, false)); + + delete snapshot[0]; + EXPECT_THAT(DeleteQueue(), ElementsAre(handle[2], snapshot[2])); + EXPECT_THAT(deleted, ElementsAre(true, true, false)); + + delete snapshot[2]; + EXPECT_THAT(DeleteQueue(), SizeIs(0)); + EXPECT_THAT(deleted, ElementsAre(true, true, deleted)); +} + +TEST(CordzHandleTest, DiagnosticsHandleIsSafeToInspect) { + CordzSnapshot snapshot1; + EXPECT_TRUE(snapshot1.DiagnosticsHandleIsSafeToInspect(nullptr)); + + auto* handle1 = new CordzHandle(); + EXPECT_TRUE(snapshot1.DiagnosticsHandleIsSafeToInspect(handle1)); + + CordzHandle::Delete(handle1); + EXPECT_TRUE(snapshot1.DiagnosticsHandleIsSafeToInspect(handle1)); + + CordzSnapshot snapshot2; + auto* handle2 = new CordzHandle(); + EXPECT_TRUE(snapshot1.DiagnosticsHandleIsSafeToInspect(handle1)); + EXPECT_TRUE(snapshot1.DiagnosticsHandleIsSafeToInspect(handle2)); + EXPECT_FALSE(snapshot2.DiagnosticsHandleIsSafeToInspect(handle1)); + EXPECT_TRUE(snapshot2.DiagnosticsHandleIsSafeToInspect(handle2)); + + CordzHandle::Delete(handle2); + EXPECT_TRUE(snapshot1.DiagnosticsHandleIsSafeToInspect(handle1)); +} + +TEST(CordzHandleTest, DiagnosticsGetSafeToInspectDeletedHandles) { + EXPECT_THAT(DeleteQueue(), IsEmpty()); + + auto* handle = new CordzHandle(); + auto* snapshot1 = 
new CordzSnapshot(); + + // snapshot1 should be able to see handle. + EXPECT_THAT(DeleteQueue(), ElementsAre(snapshot1)); + EXPECT_TRUE(snapshot1->DiagnosticsHandleIsSafeToInspect(handle)); + EXPECT_THAT(snapshot1->DiagnosticsGetSafeToInspectDeletedHandles(), + IsEmpty()); + + // This handle will be safe to inspect as long as snapshot1 is alive. However, + // since only snapshot1 can prove that it's alive, it will be hidden from + // snapshot2. + CordzHandle::Delete(handle); + + // This snapshot shouldn't be able to see handle because handle was already + // sent to Delete. + auto* snapshot2 = new CordzSnapshot(); + + // DeleteQueue elements are LIFO order. + EXPECT_THAT(DeleteQueue(), ElementsAre(snapshot2, handle, snapshot1)); + + EXPECT_TRUE(snapshot1->DiagnosticsHandleIsSafeToInspect(handle)); + EXPECT_FALSE(snapshot2->DiagnosticsHandleIsSafeToInspect(handle)); + + EXPECT_THAT(snapshot1->DiagnosticsGetSafeToInspectDeletedHandles(), + ElementsAre(handle)); + EXPECT_THAT(snapshot2->DiagnosticsGetSafeToInspectDeletedHandles(), + IsEmpty()); + + CordzHandle::Delete(snapshot1); + EXPECT_THAT(DeleteQueue(), ElementsAre(snapshot2)); + + CordzHandle::Delete(snapshot2); + EXPECT_THAT(DeleteQueue(), IsEmpty()); +} + +// Create and delete CordzHandle and CordzSnapshot objects in multiple threads +// so that tsan has some time to chew on it and look for memory problems. +TEST(CordzHandleTest, MultiThreaded) { + Notification stop; + static constexpr int kNumThreads = 4; + // Keep the number of handles relatively small so that the test will naturally + // transition to an empty delete queue during the test. If there are, say, 100 + // handles, that will virtually never happen. With 10 handles and around 50k + // iterations in each of 4 threads, the delete queue appears to become empty + // around 200 times. + static constexpr int kNumHandles = 10; + + // Each thread is going to pick a random index and atomically swap its + // CordzHandle with one in handles. 
This way, each thread can avoid + // manipulating a CordzHandle that might be operated upon in another thread. + std::vector<std::atomic<CordzHandle*>> handles(kNumHandles); + + // global bool which is set when any thread did get some 'safe to inspect' + // handles. On some platforms and OSS tests, we might risk that some pool + // threads are starved, stalled, or just got a few unlikely random 'handle' + // coin tosses, so we satisfy this test with simply observing 'some' thread + // did something meaningful, which should minimize the potential for flakes. + std::atomic<bool> found_safe_to_inspect(false); + + { + absl::synchronization_internal::ThreadPool pool(kNumThreads); + for (int i = 0; i < kNumThreads; ++i) { + pool.Schedule([&stop, &handles, &found_safe_to_inspect]() { + std::minstd_rand gen; + std::uniform_int_distribution<int> dist_type(0, 2); + std::uniform_int_distribution<int> dist_handle(0, kNumHandles - 1); + + while (!stop.HasBeenNotified()) { + CordzHandle* handle; + switch (dist_type(gen)) { + case 0: + handle = new CordzHandle(); + break; + case 1: + handle = new CordzSnapshot(); + break; + default: + handle = nullptr; + break; + } + CordzHandle* old_handle = handles[dist_handle(gen)].exchange(handle); + if (old_handle != nullptr) { + std::vector<const CordzHandle*> safe_to_inspect = + old_handle->DiagnosticsGetSafeToInspectDeletedHandles(); + for (const CordzHandle* handle : safe_to_inspect) { + // We're in a tight loop, so don't generate too many error + // messages. + ASSERT_FALSE(handle->is_snapshot()); + } + if (!safe_to_inspect.empty()) { + found_safe_to_inspect.store(true); + } + CordzHandle::Delete(old_handle); + } + } + + // Have each thread attempt to clean up everything. Some thread will be + // the last to reach this cleanup code, and it will be guaranteed to + // clean up everything because nothing remains to create new handles. 
+ for (auto& h : handles) { + if (CordzHandle* handle = h.exchange(nullptr)) { + CordzHandle::Delete(handle); + } + } + }); + } + + // The threads will hammer away. Give it a little bit of time for tsan to + // spot errors. + absl::SleepFor(absl::Seconds(3)); + stop.Notify(); + } + + // Confirm that the test did *something*. This check will be satisfied as + // long as any thread has deleted a CordzSnapshot object and a non-snapshot + // CordzHandle was deleted after the CordzSnapshot was created. + // See also comments on `found_safe_to_inspect` + EXPECT_TRUE(found_safe_to_inspect.load()); +} + +} // namespace +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_info.cc b/third_party/abseil-cpp/absl/strings/internal/cordz_info.cc new file mode 100644 index 0000000000..a3a0b9c046 --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_info.cc @@ -0,0 +1,436 @@ +// Copyright 2019 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "absl/strings/internal/cordz_info.h" + +#include "absl/base/config.h" +#include "absl/base/internal/spinlock.h" +#include "absl/container/inlined_vector.h" +#include "absl/debugging/stacktrace.h" +#include "absl/strings/internal/cord_internal.h" +#include "absl/strings/internal/cord_rep_ring.h" +#include "absl/strings/internal/cordz_handle.h" +#include "absl/strings/internal/cordz_statistics.h" +#include "absl/strings/internal/cordz_update_tracker.h" +#include "absl/synchronization/mutex.h" +#include "absl/types/span.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { + +using ::absl::base_internal::SpinLockHolder; + +constexpr int CordzInfo::kMaxStackDepth; + +ABSL_CONST_INIT CordzInfo::List CordzInfo::global_list_{absl::kConstInit}; + +namespace { + +// CordRepAnalyzer performs the analysis of a cord. +// +// It computes absolute node counts and total memory usage, and an 'estimated +// fair share memory usage` statistic. +// Conceptually, it divides the 'memory usage' at each location in the 'cord +// graph' by the cumulative reference count of that location. The cumulative +// reference count is the factored total of all edges leading into that node. +// +// The top level node is treated specially: we assume the current thread +// (typically called from the CordzHandler) to hold a reference purely to +// perform a safe analysis, and not being part of the application. So we +// substract 1 from the reference count of the top node to compute the +// 'application fair share' excluding the reference of the current thread. +// +// An example of fair sharing, and why we multiply reference counts: +// Assume we have 2 CordReps, both being a Substring referencing a Flat: +// CordSubstring A (refcount = 5) --> child Flat C (refcount = 2) +// CordSubstring B (refcount = 9) --> child Flat C (refcount = 2) +// +// Flat C has 2 incoming edges from the 2 substrings (refcount = 2) and is not +// referenced directly anywhere else. 
Translated into a 'fair share', we then +// attribute 50% of the memory (memory / refcount = 2) to each incoming edge. +// Rep A has a refcount of 5, so we attribute each incoming edge 1 / 5th of the +// memory cost below it, i.e.: the fair share of Rep A of the memory used by C +// is then 'memory C / (refcount C * refcount A) + (memory A / refcount A)'. +// It is also easy to see how all incoming edges add up to 100%. +class CordRepAnalyzer { + public: + // Creates an analyzer instance binding to `statistics`. + explicit CordRepAnalyzer(CordzStatistics& statistics) + : statistics_(statistics) {} + + // Analyzes the memory statistics and node counts for the provided `rep`, and + // adds the results to `statistics`. Note that node counts and memory sizes + // are not initialized, computed values are added to any existing values. + void AnalyzeCordRep(const CordRep* rep) { + // Process all linear nodes. + // As per the class comments, use refcout - 1 on the top level node, as the + // top level node is assumed to be referenced only for analysis purposes. + size_t refcount = rep->refcount.Get(); + RepRef repref{rep, (refcount > 1) ? refcount - 1 : 1}; + + // Process all top level linear nodes (substrings and flats). + repref = CountLinearReps(repref, memory_usage_); + + // We should have have either a concat or ring node node if not null. + if (repref.rep != nullptr) { + assert(repref.rep->tag == RING || repref.rep->tag == CONCAT); + if (repref.rep->tag == RING) { + AnalyzeRing(repref); + } else if (repref.rep->tag == CONCAT) { + AnalyzeConcat(repref); + } + } + + // Adds values to output + statistics_.estimated_memory_usage += memory_usage_.total; + statistics_.estimated_fair_share_memory_usage += memory_usage_.fair_share; + } + + private: + // RepRef identifies a CordRep* inside the Cord tree with its cumulative + // refcount including itself. 
For example, a tree consisting of a substring + // with a refcount of 3 and a child flat with a refcount of 4 will have RepRef + // refcounts of 3 and 12 respectively. + struct RepRef { + const CordRep* rep; + size_t refcount; + + // Returns a 'child' RepRef which contains the cumulative reference count of + // this instance multiplied by the child's reference count. + RepRef Child(const CordRep* child) const { + return RepRef{child, refcount * child->refcount.Get()}; + } + }; + + // Memory usage values + struct MemoryUsage { + size_t total = 0; + size_t fair_share = 0; + + // Adds 'size` memory usage to this class, with a cumulative (recursive) + // reference count of `refcount` + void Add(size_t size, size_t refcount) { + total += size; + fair_share += size / refcount; + } + }; + + // Returns `rr` if `rr.rep` is not null and a CONCAT type. + // Asserts that `rr.rep` is a concat node or null. + static RepRef AssertConcat(RepRef repref) { + const CordRep* rep = repref.rep; + assert(rep == nullptr || rep->tag == CONCAT); + return (rep != nullptr && rep->tag == CONCAT) ? repref : RepRef{nullptr, 0}; + } + + // Counts a flat of the provide allocated size + void CountFlat(size_t size) { + statistics_.node_count++; + statistics_.node_counts.flat++; + if (size <= 64) { + statistics_.node_counts.flat_64++; + } else if (size <= 128) { + statistics_.node_counts.flat_128++; + } else if (size <= 256) { + statistics_.node_counts.flat_256++; + } else if (size <= 512) { + statistics_.node_counts.flat_512++; + } else if (size <= 1024) { + statistics_.node_counts.flat_1k++; + } + } + + // Processes 'linear' reps (substring, flat, external) not requiring iteration + // or recursion. Returns RefRep{null} if all reps were processed, else returns + // the top-most non-linear concat or ring cordrep. 
+ // Node counts are updated into `statistics_`, memory usage is update into + // `memory_usage`, which typically references `memory_usage_` except for ring + // buffers where we count children unrounded. + RepRef CountLinearReps(RepRef rep, MemoryUsage& memory_usage) { + // Consume all substrings + while (rep.rep->tag == SUBSTRING) { + statistics_.node_count++; + statistics_.node_counts.substring++; + memory_usage.Add(sizeof(CordRepSubstring), rep.refcount); + rep = rep.Child(rep.rep->substring()->child); + } + + // Consume possible FLAT + if (rep.rep->tag >= FLAT) { + size_t size = rep.rep->flat()->AllocatedSize(); + CountFlat(size); + memory_usage.Add(size, rep.refcount); + return RepRef{nullptr, 0}; + } + + // Consume possible external + if (rep.rep->tag == EXTERNAL) { + statistics_.node_count++; + statistics_.node_counts.external++; + size_t size = rep.rep->length + sizeof(CordRepExternalImpl<intptr_t>); + memory_usage.Add(size, rep.refcount); + return RepRef{nullptr, 0}; + } + + return rep; + } + + // Analyzes the provided concat node in a flattened recursive way. + void AnalyzeConcat(RepRef rep) { + absl::InlinedVector<RepRef, 47> pending; + + while (rep.rep != nullptr) { + const CordRepConcat* concat = rep.rep->concat(); + RepRef left = rep.Child(concat->left); + RepRef right = rep.Child(concat->right); + + statistics_.node_count++; + statistics_.node_counts.concat++; + memory_usage_.Add(sizeof(CordRepConcat), rep.refcount); + + right = AssertConcat(CountLinearReps(right, memory_usage_)); + rep = AssertConcat(CountLinearReps(left, memory_usage_)); + if (rep.rep != nullptr) { + if (right.rep != nullptr) { + pending.push_back(right); + } + } else if (right.rep != nullptr) { + rep = right; + } else if (!pending.empty()) { + rep = pending.back(); + pending.pop_back(); + } + } + } + + // Counts the provided ring buffer child into `child_usage`. 
+ void CountRingChild(const CordRep* child, MemoryUsage& child_usage) { + RepRef rep{child, static_cast<size_t>(child->refcount.Get())}; + rep = CountLinearReps(rep, child_usage); + assert(rep.rep == nullptr); + } + + // Analyzes the provided ring. As ring buffers can have many child nodes, the + // effect of rounding errors can become non trivial, so we compute the totals + // first at the ring level, and then divide the fair share of the total + // including children fair share totals. + void AnalyzeRing(RepRef rep) { + statistics_.node_count++; + statistics_.node_counts.ring++; + MemoryUsage ring_usage; + const CordRepRing* ring = rep.rep->ring(); + ring_usage.Add(CordRepRing::AllocSize(ring->capacity()), 1); + ring->ForEach([&](CordRepRing::index_type pos) { + CountRingChild(ring->entry_child(pos), ring_usage); + }); + memory_usage_.total += ring_usage.total; + memory_usage_.fair_share += ring_usage.fair_share / rep.refcount; + } + + CordzStatistics& statistics_; + MemoryUsage memory_usage_; +}; + +} // namespace + +CordzInfo* CordzInfo::Head(const CordzSnapshot& snapshot) { + ABSL_ASSERT(snapshot.is_snapshot()); + + // We can do an 'unsafe' load of 'head', as we are guaranteed that the + // instance it points to is kept alive by the provided CordzSnapshot, so we + // can simply return the current value using an acquire load. + // We do enforce in DEBUG builds that the 'head' value is present in the + // delete queue: ODR violations may lead to 'snapshot' and 'global_list_' + // being in different libraries / modules. + CordzInfo* head = global_list_.head.load(std::memory_order_acquire); + ABSL_ASSERT(snapshot.DiagnosticsHandleIsSafeToInspect(head)); + return head; +} + +CordzInfo* CordzInfo::Next(const CordzSnapshot& snapshot) const { + ABSL_ASSERT(snapshot.is_snapshot()); + + // Similar to the 'Head()' function, we do not need a mutex here. 
+ CordzInfo* next = ci_next_.load(std::memory_order_acquire); + ABSL_ASSERT(snapshot.DiagnosticsHandleIsSafeToInspect(this)); + ABSL_ASSERT(snapshot.DiagnosticsHandleIsSafeToInspect(next)); + return next; +} + +void CordzInfo::TrackCord(InlineData& cord, MethodIdentifier method) { + assert(cord.is_tree()); + assert(!cord.is_profiled()); + CordzInfo* cordz_info = new CordzInfo(cord.as_tree(), nullptr, method); + cord.set_cordz_info(cordz_info); + cordz_info->Track(); +} + +void CordzInfo::TrackCord(InlineData& cord, const InlineData& src, + MethodIdentifier method) { + assert(cord.is_tree()); + assert(src.is_tree()); + + // Unsample current as we the current cord is being replaced with 'src', + // so any method history is no longer relevant. + CordzInfo* cordz_info = cord.cordz_info(); + if (cordz_info != nullptr) cordz_info->Untrack(); + + // Start new cord sample + cordz_info = new CordzInfo(cord.as_tree(), src.cordz_info(), method); + cord.set_cordz_info(cordz_info); + cordz_info->Track(); +} + +void CordzInfo::MaybeTrackCordImpl(InlineData& cord, const InlineData& src, + MethodIdentifier method) { + if (src.is_profiled()) { + TrackCord(cord, src, method); + } else if (cord.is_profiled()) { + cord.cordz_info()->Untrack(); + cord.clear_cordz_info(); + } +} + +CordzInfo::MethodIdentifier CordzInfo::GetParentMethod(const CordzInfo* src) { + if (src == nullptr) return MethodIdentifier::kUnknown; + return src->parent_method_ != MethodIdentifier::kUnknown ? 
src->parent_method_ + : src->method_; +} + +int CordzInfo::FillParentStack(const CordzInfo* src, void** stack) { + assert(stack); + if (src == nullptr) return 0; + if (src->parent_stack_depth_) { + memcpy(stack, src->parent_stack_, src->parent_stack_depth_ * sizeof(void*)); + return src->parent_stack_depth_; + } + memcpy(stack, src->stack_, src->stack_depth_ * sizeof(void*)); + return src->stack_depth_; +} + +CordzInfo::CordzInfo(CordRep* rep, const CordzInfo* src, + MethodIdentifier method) + : rep_(rep), + stack_depth_(absl::GetStackTrace(stack_, /*max_depth=*/kMaxStackDepth, + /*skip_count=*/1)), + parent_stack_depth_(FillParentStack(src, parent_stack_)), + method_(method), + parent_method_(GetParentMethod(src)), + create_time_(absl::Now()) { + update_tracker_.LossyAdd(method); + if (src) { + // Copy parent counters. + update_tracker_.LossyAdd(src->update_tracker_); + } +} + +CordzInfo::~CordzInfo() { + // `rep_` is potentially kept alive if CordzInfo is included + // in a collection snapshot (which should be rare). 
+ if (ABSL_PREDICT_FALSE(rep_)) { + CordRep::Unref(rep_); + } +} + +void CordzInfo::Track() { + SpinLockHolder l(&list_->mutex); + + CordzInfo* const head = list_->head.load(std::memory_order_acquire); + if (head != nullptr) { + head->ci_prev_.store(this, std::memory_order_release); + } + ci_next_.store(head, std::memory_order_release); + list_->head.store(this, std::memory_order_release); +} + +void CordzInfo::Untrack() { + ODRCheck(); + { + SpinLockHolder l(&list_->mutex); + + CordzInfo* const head = list_->head.load(std::memory_order_acquire); + CordzInfo* const next = ci_next_.load(std::memory_order_acquire); + CordzInfo* const prev = ci_prev_.load(std::memory_order_acquire); + + if (next) { + ABSL_ASSERT(next->ci_prev_.load(std::memory_order_acquire) == this); + next->ci_prev_.store(prev, std::memory_order_release); + } + if (prev) { + ABSL_ASSERT(head != this); + ABSL_ASSERT(prev->ci_next_.load(std::memory_order_acquire) == this); + prev->ci_next_.store(next, std::memory_order_release); + } else { + ABSL_ASSERT(head == this); + list_->head.store(next, std::memory_order_release); + } + } + + // We can no longer be discovered: perform a fast path check if we are not + // listed on any delete queue, so we can directly delete this instance. 
+ if (SafeToDelete()) { + UnsafeSetCordRep(nullptr); + delete this; + return; + } + + // We are likely part of a snapshot, extend the life of the CordRep + { + absl::MutexLock lock(&mutex_); + if (rep_) CordRep::Ref(rep_); + } + CordzHandle::Delete(this); +} + +void CordzInfo::Lock(MethodIdentifier method) + ABSL_EXCLUSIVE_LOCK_FUNCTION(mutex_) { + mutex_.Lock(); + update_tracker_.LossyAdd(method); + assert(rep_); +} + +void CordzInfo::Unlock() ABSL_UNLOCK_FUNCTION(mutex_) { + bool tracked = rep_ != nullptr; + mutex_.Unlock(); + if (!tracked) { + Untrack(); + } +} + +absl::Span<void* const> CordzInfo::GetStack() const { + return absl::MakeConstSpan(stack_, stack_depth_); +} + +absl::Span<void* const> CordzInfo::GetParentStack() const { + return absl::MakeConstSpan(parent_stack_, parent_stack_depth_); +} + +CordzStatistics CordzInfo::GetCordzStatistics() const { + CordzStatistics stats; + stats.method = method_; + stats.parent_method = parent_method_; + stats.update_tracker = update_tracker_; + if (CordRep* rep = RefCordRep()) { + stats.size = rep->length; + CordRepAnalyzer analyzer(stats); + analyzer.AnalyzeCordRep(rep); + CordRep::Unref(rep); + } + return stats; +} + +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_info.h b/third_party/abseil-cpp/absl/strings/internal/cordz_info.h new file mode 100644 index 0000000000..026d5b9981 --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_info.h @@ -0,0 +1,298 @@ +// Copyright 2019 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ABSL_STRINGS_CORDZ_INFO_H_ +#define ABSL_STRINGS_CORDZ_INFO_H_ + +#include <atomic> +#include <cstdint> +#include <functional> + +#include "absl/base/config.h" +#include "absl/base/internal/raw_logging.h" +#include "absl/base/internal/spinlock.h" +#include "absl/base/thread_annotations.h" +#include "absl/strings/internal/cord_internal.h" +#include "absl/strings/internal/cordz_functions.h" +#include "absl/strings/internal/cordz_handle.h" +#include "absl/strings/internal/cordz_statistics.h" +#include "absl/strings/internal/cordz_update_tracker.h" +#include "absl/synchronization/mutex.h" +#include "absl/types/span.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { + +// CordzInfo tracks a profiled Cord. Each of these objects can be in two places. +// If a Cord is alive, the CordzInfo will be in the global_cordz_infos map, and +// can also be retrieved via the linked list starting with +// global_cordz_infos_head and continued via the cordz_info_next() method. When +// a Cord has reached the end of its lifespan, the CordzInfo object will be +// migrated out of the global_cordz_infos list and the global_cordz_infos_map, +// and will either be deleted or appended to the global_delete_queue. If it is +// placed on the global_delete_queue, the CordzInfo object will be cleaned in +// the destructor of a CordzSampleToken object. 
+class ABSL_LOCKABLE CordzInfo : public CordzHandle { + public: + using MethodIdentifier = CordzUpdateTracker::MethodIdentifier; + + // TrackCord creates a CordzInfo instance which tracks important metrics of + // a sampled cord, and stores the created CordzInfo instance into `cord'. All + // CordzInfo instances are placed in a global list which is used to discover + // and snapshot all actively tracked cords. Callers are responsible for + // calling UntrackCord() before the tracked Cord instance is deleted, or to + // stop tracking the sampled Cord. Callers are also responsible for guarding + // changes to the 'tree' value of a Cord (InlineData.tree) through the Lock() + // and Unlock() calls. Any change resulting in a new tree value for the cord + // requires a call to SetCordRep() before the old tree has been unreffed + // and/or deleted. `method` identifies the Cord public API method initiating + // the cord to be sampled. + // Requires `cord` to hold a tree, and `cord.cordz_info()` to be null. + static void TrackCord(InlineData& cord, MethodIdentifier method); + + // Identical to TrackCord(), except that this function fills the + // `parent_stack` and `parent_method` properties of the returned CordzInfo + // instance from the provided `src` instance if `src` is sampled. + // This function should be used for sampling 'copy constructed' and 'copy + // assigned' cords. This function allows 'cord` to be already sampled, in + // which case the CordzInfo will be newly created from `src`. + static void TrackCord(InlineData& cord, const InlineData& src, + MethodIdentifier method); + + // Maybe sample the cord identified by 'cord' for method 'method'. + // Uses `cordz_should_profile` to randomly pick cords to be sampled, and if + // so, invokes `TrackCord` to start sampling `cord`. + static void MaybeTrackCord(InlineData& cord, MethodIdentifier method); + + // Maybe sample the cord identified by 'cord' for method 'method'. 
+ // `src` identifies a 'parent' cord which is assigned to `cord`, typically the + // input cord for a copy constructor, or an assign method such as `operator=` + // `cord` will be sampled if (and only if) `src` is sampled. + // If `cord` is currently being sampled and `src` is not being sampled, then + // this function will stop sampling the cord and reset the cord's cordz_info. + // + // Previously this function defined that `cord` will be sampled if either + // `src` is sampled, or if `cord` is randomly picked for sampling. However, + // this can cause issues, as there may be paths where some cord is assigned an + // indirect copy of it's own value. As such a 'string of copies' would then + // remain sampled (`src.is_profiled`), then assigning such a cord back to + // 'itself' creates a cycle where the cord will converge to 'always sampled`. + // + // For example: + // + // Cord x; + // for (...) { + // // Copy ctor --> y.is_profiled := x.is_profiled | random(...) + // Cord y = x; + // ... + // // Assign x = y --> x.is_profiled = y.is_profiled | random(...) + // // ==> x.is_profiled |= random(...) + // // ==> x converges to 'always profiled' + // x = y; + // } + static void MaybeTrackCord(InlineData& cord, const InlineData& src, + MethodIdentifier method); + + // Stops tracking changes for a sampled cord, and deletes the provided info. + // This function must be called before the sampled cord instance is deleted, + // and before the root cordrep of the sampled cord is unreffed. + // This function may extend the lifetime of the cordrep in cases where the + // CordInfo instance is being held by a concurrent collection thread. + void Untrack(); + + // Invokes UntrackCord() on `info` if `info` is not null. + static void MaybeUntrackCord(CordzInfo* info); + + CordzInfo() = delete; + CordzInfo(const CordzInfo&) = delete; + CordzInfo& operator=(const CordzInfo&) = delete; + + // Retrieves the oldest existing CordzInfo. 
+ static CordzInfo* Head(const CordzSnapshot& snapshot) + ABSL_NO_THREAD_SAFETY_ANALYSIS; + + // Retrieves the next oldest existing CordzInfo older than 'this' instance. + CordzInfo* Next(const CordzSnapshot& snapshot) const + ABSL_NO_THREAD_SAFETY_ANALYSIS; + + // Locks this instance for the update identified by `method`. + // Increases the count for `method` in `update_tracker`. + void Lock(MethodIdentifier method) ABSL_EXCLUSIVE_LOCK_FUNCTION(mutex_); + + // Unlocks this instance. If the contained `rep` has been set to null + // indicating the Cord has been cleared or is otherwise no longer sampled, + // then this method will delete this CordzInfo instance. + void Unlock() ABSL_UNLOCK_FUNCTION(mutex_); + + // Asserts that this CordzInfo instance is locked. + void AssertHeld() ABSL_ASSERT_EXCLUSIVE_LOCK(mutex_); + + // Updates the `rep` property of this instance. This methods is invoked by + // Cord logic each time the root node of a sampled Cord changes, and before + // the old root reference count is deleted. This guarantees that collection + // code can always safely take a reference on the tracked cord. + // Requires a lock to be held through the `Lock()` method. + // TODO(b/117940323): annotate with ABSL_EXCLUSIVE_LOCKS_REQUIRED once all + // Cord code is in a state where this can be proven true by the compiler. + void SetCordRep(CordRep* rep); + + // Returns the current `rep` property of this instance with a reference + // added, or null if this instance represents a cord that has since been + // deleted or untracked. + CordRep* RefCordRep() const ABSL_LOCKS_EXCLUDED(mutex_); + + // Returns the current value of `rep_` for testing purposes only. + CordRep* GetCordRepForTesting() const ABSL_NO_THREAD_SAFETY_ANALYSIS { + return rep_; + } + + // Sets the current value of `rep_` for testing purposes only. 
+ void SetCordRepForTesting(CordRep* rep) ABSL_NO_THREAD_SAFETY_ANALYSIS { + rep_ = rep; + } + + // Returns the stack trace for where the cord was first sampled. Cords are + // potentially sampled when they promote from an inlined cord to a tree or + // ring representation, which is not necessarily the location where the cord + // was first created. Some cords are created as inlined cords, and only as + // data is added do they become a non-inlined cord. However, typically the + // location represents reasonably well where the cord is 'created'. + absl::Span<void* const> GetStack() const; + + // Returns the stack trace for a sampled cord's 'parent stack trace'. This + // value may be set if the cord is sampled (promoted) after being created + // from, or being assigned the value of an existing (sampled) cord. + absl::Span<void* const> GetParentStack() const; + + // Retrieves the CordzStatistics associated with this Cord. The statistics + // are only updated when a Cord goes through a mutation, such as an Append + // or RemovePrefix. + CordzStatistics GetCordzStatistics() const; + + private: + using SpinLock = absl::base_internal::SpinLock; + using SpinLockHolder = ::absl::base_internal::SpinLockHolder; + + // Global cordz info list. CordzInfo stores a pointer to the global list + // instance to harden against ODR violations. + struct List { + constexpr explicit List(absl::ConstInitType) + : mutex(absl::kConstInit, + absl::base_internal::SCHEDULE_COOPERATIVE_AND_KERNEL) {} + + SpinLock mutex; + std::atomic<CordzInfo*> head ABSL_GUARDED_BY(mutex){nullptr}; + }; + + static constexpr int kMaxStackDepth = 64; + + explicit CordzInfo(CordRep* rep, const CordzInfo* src, + MethodIdentifier method); + ~CordzInfo() override; + + // Sets `rep_` without holding a lock. 
+ void UnsafeSetCordRep(CordRep* rep) ABSL_NO_THREAD_SAFETY_ANALYSIS; + + void Track(); + + // Returns the parent method from `src`, which is either `parent_method_` or + // `method_` depending on `parent_method_` being kUnknown. + // Returns kUnknown if `src` is null. + static MethodIdentifier GetParentMethod(const CordzInfo* src); + + // Fills the provided stack from `src`, copying either `parent_stack_` or + // `stack_` depending on `parent_stack_` being empty, returning the size of + // the parent stack. + // Returns 0 if `src` is null. + static int FillParentStack(const CordzInfo* src, void** stack); + + void ODRCheck() const { +#ifndef NDEBUG + ABSL_RAW_CHECK(list_ == &global_list_, "ODR violation in Cord"); +#endif + } + + // Non-inlined implementation of `MaybeTrackCord`, which is executed if + // either `src` is sampled or `cord` is sampled, and either untracks or + // tracks `cord` as documented per `MaybeTrackCord`. + static void MaybeTrackCordImpl(InlineData& cord, const InlineData& src, + MethodIdentifier method); + + ABSL_CONST_INIT static List global_list_; + List* const list_ = &global_list_; + + // ci_prev_ and ci_next_ require the global list mutex to be held. + // Unfortunately we can't use thread annotations such that the thread safety + // analysis understands that list_ and global_list_ are one and the same. 
+ std::atomic<CordzInfo*> ci_prev_{nullptr}; + std::atomic<CordzInfo*> ci_next_{nullptr}; + + mutable absl::Mutex mutex_; + CordRep* rep_ ABSL_GUARDED_BY(mutex_); + + void* stack_[kMaxStackDepth]; + void* parent_stack_[kMaxStackDepth]; + const int stack_depth_; + const int parent_stack_depth_; + const MethodIdentifier method_; + const MethodIdentifier parent_method_; + CordzUpdateTracker update_tracker_; + const absl::Time create_time_; +}; + +inline ABSL_ATTRIBUTE_ALWAYS_INLINE void CordzInfo::MaybeTrackCord( + InlineData& cord, MethodIdentifier method) { + if (ABSL_PREDICT_FALSE(cordz_should_profile())) { + TrackCord(cord, method); + } +} + +inline ABSL_ATTRIBUTE_ALWAYS_INLINE void CordzInfo::MaybeTrackCord( + InlineData& cord, const InlineData& src, MethodIdentifier method) { + if (ABSL_PREDICT_FALSE(InlineData::is_either_profiled(cord, src))) { + MaybeTrackCordImpl(cord, src, method); + } +} + +inline ABSL_ATTRIBUTE_ALWAYS_INLINE void CordzInfo::MaybeUntrackCord( + CordzInfo* info) { + if (ABSL_PREDICT_FALSE(info)) { + info->Untrack(); + } +} + +inline void CordzInfo::AssertHeld() ABSL_ASSERT_EXCLUSIVE_LOCK(mutex_) { +#ifndef NDEBUG + mutex_.AssertHeld(); +#endif +} + +inline void CordzInfo::SetCordRep(CordRep* rep) { + AssertHeld(); + rep_ = rep; +} + +inline void CordzInfo::UnsafeSetCordRep(CordRep* rep) { rep_ = rep; } + +inline CordRep* CordzInfo::RefCordRep() const ABSL_LOCKS_EXCLUDED(mutex_) { + MutexLock lock(&mutex_); + return rep_ ? 
CordRep::Ref(rep_) : nullptr; +} + +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl + +#endif // ABSL_STRINGS_CORDZ_INFO_H_ diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_info_statistics_test.cc b/third_party/abseil-cpp/absl/strings/internal/cordz_info_statistics_test.cc new file mode 100644 index 0000000000..9f2842d97d --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_info_statistics_test.cc @@ -0,0 +1,508 @@ +// Copyright 2021 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <iostream> +#include <random> +#include <vector> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/base/config.h" +#include "absl/strings/cord.h" +#include "absl/strings/internal/cord_internal.h" +#include "absl/strings/internal/cord_rep_flat.h" +#include "absl/strings/internal/cord_rep_ring.h" +#include "absl/strings/internal/cordz_info.h" +#include "absl/strings/internal/cordz_sample_token.h" +#include "absl/strings/internal/cordz_statistics.h" +#include "absl/strings/internal/cordz_update_scope.h" +#include "absl/strings/internal/cordz_update_tracker.h" +#include "absl/synchronization/internal/thread_pool.h" +#include "absl/synchronization/notification.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { + +// Do not print statistics contents, the matcher prints them as needed. 
+inline void PrintTo(const CordzStatistics& stats, std::ostream* s) { + if (s) *s << "CordzStatistics{...}"; +} + +namespace { + +// Creates a flat of the specified allocated size +CordRepFlat* Flat(size_t size) { + // Round up to a tag size, as we are going to poke an exact tag size back into + // the allocated flat. 'size returning allocators' could grant us more than we + // wanted, but we are ok to poke the 'requested' size in the tag, even in the + // presence of sized deletes, so we need to make sure the size rounds + // perfectly to a tag value. + assert(size >= kMinFlatSize); + size = RoundUpForTag(size); + CordRepFlat* flat = CordRepFlat::New(size - kFlatOverhead); + flat->tag = AllocatedSizeToTag(size); + flat->length = size - kFlatOverhead; + return flat; +} + +// Creates an external of the specified length +CordRepExternal* External(int length = 512) { + return static_cast<CordRepExternal*>( + NewExternalRep(absl::string_view("", length), [](absl::string_view) {})); +} + +// Creates a substring on the provided rep of length - 1 +CordRepSubstring* Substring(CordRep* rep) { + auto* substring = new CordRepSubstring; + substring->length = rep->length - 1; + substring->tag = SUBSTRING; + substring->child = rep; + return substring; +} + +// Creates a concat on the provided reps +CordRepConcat* Concat(CordRep* left, CordRep* right) { + auto* concat = new CordRepConcat; + concat->length = left->length + right->length; + concat->tag = CONCAT; + concat->left = left; + concat->right = right; + return concat; +} + +// Reference count helper +struct RefHelper { + std::vector<CordRep*> refs; + + ~RefHelper() { + for (CordRep* rep : refs) { + CordRep::Unref(rep); + } + } + + // Invokes CordRep::Unref() on `rep` when this instance is destroyed. + template <typename T> + T* NeedsUnref(T* rep) { + refs.push_back(rep); + return rep; + } + + // Adds `n` reference counts to `rep` which will be unreffed when this + // instance is destroyed. 
+ template <typename T> + T* Ref(T* rep, size_t n = 1) { + while (n--) { + NeedsUnref(CordRep::Ref(rep)); + } + return rep; + } +}; + +// Sizeof helper. Returns the allocated size of `p`, excluding any child +// elements for substring, concat and ring cord reps. +template <typename T> +size_t SizeOf(const T* rep) { + return sizeof(T); +} + +template <> +size_t SizeOf(const CordRepFlat* rep) { + return rep->AllocatedSize(); +} + +template <> +size_t SizeOf(const CordRepExternal* rep) { + // See cord.cc + return sizeof(CordRepExternalImpl<intptr_t>) + rep->length; +} + +template <> +size_t SizeOf(const CordRepRing* rep) { + return CordRepRing::AllocSize(rep->capacity()); +} + +// Computes fair share memory used in a naive 'we dare to recurse' way. +size_t FairShare(CordRep* rep, size_t ref = 1) { + size_t self = 0, children = 0; + ref *= rep->refcount.Get(); + if (rep->tag >= FLAT) { + self = SizeOf(rep->flat()); + } else if (rep->tag == EXTERNAL) { + self = SizeOf(rep->external()); + } else if (rep->tag == SUBSTRING) { + self = SizeOf(rep->substring()); + children = FairShare(rep->substring()->child, ref); + } else if (rep->tag == RING) { + self = SizeOf(rep->ring()); + rep->ring()->ForEach([&](CordRepRing::index_type i) { + self += FairShare(rep->ring()->entry_child(i)); + }); + } else if (rep->tag == CONCAT) { + self = SizeOf(rep->concat()); + children = FairShare(rep->concat()->left, ref) + + FairShare(rep->concat()->right, ref); + } else { + assert(false); + } + return self / ref + children; +} + +// Samples the cord and returns CordzInfo::GetStatistics() +CordzStatistics SampleCord(CordRep* rep) { + InlineData cord(rep); + CordzInfo::TrackCord(cord, CordzUpdateTracker::kUnknown); + CordzStatistics stats = cord.cordz_info()->GetCordzStatistics(); + cord.cordz_info()->Untrack(); + return stats; +} + +MATCHER_P(EqStatistics, stats, "Statistics equal expected values") { + bool ok = true; + +#define STATS_MATCHER_EXPECT_EQ(member) \ + if (stats.member != arg.member) 
{ \ + *result_listener << "\n stats." << #member \ + << ": actual = " << arg.member << ", expected " \ + << stats.member; \ + ok = false; \ + } + + STATS_MATCHER_EXPECT_EQ(size); + STATS_MATCHER_EXPECT_EQ(node_count); + STATS_MATCHER_EXPECT_EQ(node_counts.flat); + STATS_MATCHER_EXPECT_EQ(node_counts.flat_64); + STATS_MATCHER_EXPECT_EQ(node_counts.flat_128); + STATS_MATCHER_EXPECT_EQ(node_counts.flat_256); + STATS_MATCHER_EXPECT_EQ(node_counts.flat_512); + STATS_MATCHER_EXPECT_EQ(node_counts.flat_1k); + STATS_MATCHER_EXPECT_EQ(node_counts.external); + STATS_MATCHER_EXPECT_EQ(node_counts.concat); + STATS_MATCHER_EXPECT_EQ(node_counts.substring); + STATS_MATCHER_EXPECT_EQ(node_counts.ring); + STATS_MATCHER_EXPECT_EQ(estimated_memory_usage); + STATS_MATCHER_EXPECT_EQ(estimated_fair_share_memory_usage); + +#undef STATS_MATCHER_EXPECT_EQ + + return ok; +} + +TEST(CordzInfoStatisticsTest, Flat) { + RefHelper ref; + auto* flat = ref.NeedsUnref(Flat(512)); + + CordzStatistics expected; + expected.size = flat->length; + expected.estimated_memory_usage = SizeOf(flat); + expected.estimated_fair_share_memory_usage = expected.estimated_memory_usage; + expected.node_count = 1; + expected.node_counts.flat = 1; + expected.node_counts.flat_512 = 1; + + EXPECT_THAT(SampleCord(flat), EqStatistics(expected)); +} + +TEST(CordzInfoStatisticsTest, SharedFlat) { + RefHelper ref; + auto* flat = ref.Ref(ref.NeedsUnref(Flat(64))); + + CordzStatistics expected; + expected.size = flat->length; + expected.estimated_memory_usage = SizeOf(flat); + expected.estimated_fair_share_memory_usage = SizeOf(flat) / 2; + expected.node_count = 1; + expected.node_counts.flat = 1; + expected.node_counts.flat_64 = 1; + + EXPECT_THAT(SampleCord(flat), EqStatistics(expected)); +} + +TEST(CordzInfoStatisticsTest, External) { + RefHelper ref; + auto* external = ref.NeedsUnref(External()); + + CordzStatistics expected; + expected.size = external->length; + expected.estimated_memory_usage = SizeOf(external); + 
expected.estimated_fair_share_memory_usage = SizeOf(external); + expected.node_count = 1; + expected.node_counts.external = 1; + + EXPECT_THAT(SampleCord(external), EqStatistics(expected)); +} + +TEST(CordzInfoStatisticsTest, SharedExternal) { + RefHelper ref; + auto* external = ref.Ref(ref.NeedsUnref(External())); + + CordzStatistics expected; + expected.size = external->length; + expected.estimated_memory_usage = SizeOf(external); + expected.estimated_fair_share_memory_usage = SizeOf(external) / 2; + expected.node_count = 1; + expected.node_counts.external = 1; + + EXPECT_THAT(SampleCord(external), EqStatistics(expected)); +} + +TEST(CordzInfoStatisticsTest, Substring) { + RefHelper ref; + auto* flat = Flat(1024); + auto* substring = ref.NeedsUnref(Substring(flat)); + + CordzStatistics expected; + expected.size = substring->length; + expected.estimated_memory_usage = SizeOf(substring) + SizeOf(flat); + expected.estimated_fair_share_memory_usage = expected.estimated_memory_usage; + expected.node_count = 2; + expected.node_counts.flat = 1; + expected.node_counts.flat_1k = 1; + expected.node_counts.substring = 1; + + EXPECT_THAT(SampleCord(substring), EqStatistics(expected)); +} + +TEST(CordzInfoStatisticsTest, SharedSubstring) { + RefHelper ref; + auto* flat = ref.Ref(Flat(511), 2); + auto* substring = ref.Ref(ref.NeedsUnref(Substring(flat))); + + CordzStatistics expected; + expected.size = substring->length; + expected.estimated_memory_usage = SizeOf(flat) + SizeOf(substring); + expected.estimated_fair_share_memory_usage = + SizeOf(substring) / 2 + SizeOf(flat) / 6; + expected.node_count = 2; + expected.node_counts.flat = 1; + expected.node_counts.flat_512 = 1; + expected.node_counts.substring = 1; + + EXPECT_THAT(SampleCord(substring), EqStatistics(expected)); +} + +TEST(CordzInfoStatisticsTest, Concat) { + RefHelper ref; + auto* flat1 = Flat(300); + auto* flat2 = Flat(2000); + auto* concat = ref.NeedsUnref(Concat(flat1, flat2)); + + CordzStatistics expected; + 
expected.size = concat->length; + expected.estimated_memory_usage = + SizeOf(concat) + SizeOf(flat1) + SizeOf(flat2); + expected.estimated_fair_share_memory_usage = expected.estimated_memory_usage; + expected.node_count = 3; + expected.node_counts.flat = 2; + expected.node_counts.flat_512 = 1; + expected.node_counts.concat = 1; + + EXPECT_THAT(SampleCord(concat), EqStatistics(expected)); +} + +TEST(CordzInfoStatisticsTest, DeepConcat) { + RefHelper ref; + auto* flat1 = Flat(300); + auto* flat2 = Flat(2000); + auto* flat3 = Flat(400); + auto* external = External(3000); + auto* substring = Substring(external); + auto* concat1 = Concat(flat1, flat2); + auto* concat2 = Concat(flat3, substring); + auto* concat = ref.NeedsUnref(Concat(concat1, concat2)); + + CordzStatistics expected; + expected.size = concat->length; + expected.estimated_memory_usage = SizeOf(concat) * 3 + SizeOf(flat1) + + SizeOf(flat2) + SizeOf(flat3) + + SizeOf(external) + SizeOf(substring); + expected.estimated_fair_share_memory_usage = expected.estimated_memory_usage; + + expected.node_count = 8; + expected.node_counts.flat = 3; + expected.node_counts.flat_512 = 2; + expected.node_counts.external = 1; + expected.node_counts.concat = 3; + expected.node_counts.substring = 1; + + EXPECT_THAT(SampleCord(concat), EqStatistics(expected)); +} + +TEST(CordzInfoStatisticsTest, DeepSharedConcat) { + RefHelper ref; + auto* flat1 = Flat(40); + auto* flat2 = ref.Ref(Flat(2000), 4); + auto* flat3 = Flat(70); + auto* external = ref.Ref(External(3000)); + auto* substring = ref.Ref(Substring(external), 3); + auto* concat1 = Concat(flat1, flat2); + auto* concat2 = Concat(flat3, substring); + auto* concat = ref.Ref(ref.NeedsUnref(Concat(concat1, concat2))); + + CordzStatistics expected; + expected.size = concat->length; + expected.estimated_memory_usage = SizeOf(concat) * 3 + SizeOf(flat1) + + SizeOf(flat2) + SizeOf(flat3) + + SizeOf(external) + SizeOf(substring); + expected.estimated_fair_share_memory_usage = 
FairShare(concat); + expected.node_count = 8; + expected.node_counts.flat = 3; + expected.node_counts.flat_64 = 1; + expected.node_counts.flat_128 = 1; + expected.node_counts.external = 1; + expected.node_counts.concat = 3; + expected.node_counts.substring = 1; + + EXPECT_THAT(SampleCord(concat), EqStatistics(expected)); +} + +TEST(CordzInfoStatisticsTest, Ring) { + RefHelper ref; + auto* flat1 = Flat(240); + auto* flat2 = Flat(2000); + auto* flat3 = Flat(70); + auto* external = External(3000); + CordRepRing* ring = CordRepRing::Create(flat1); + ring = CordRepRing::Append(ring, flat2); + ring = CordRepRing::Append(ring, flat3); + ring = ref.NeedsUnref(CordRepRing::Append(ring, external)); + + CordzStatistics expected; + expected.size = ring->length; + expected.estimated_memory_usage = SizeOf(ring) + SizeOf(flat1) + + SizeOf(flat2) + SizeOf(flat3) + + SizeOf(external); + expected.estimated_fair_share_memory_usage = expected.estimated_memory_usage; + expected.node_count = 5; + expected.node_counts.flat = 3; + expected.node_counts.flat_128 = 1; + expected.node_counts.flat_256 = 1; + expected.node_counts.external = 1; + expected.node_counts.ring = 1; + + EXPECT_THAT(SampleCord(ring), EqStatistics(expected)); +} + +TEST(CordzInfoStatisticsTest, SharedSubstringRing) { + RefHelper ref; + auto* flat1 = ref.Ref(Flat(240)); + auto* flat2 = Flat(200); + auto* flat3 = Flat(70); + auto* external = ref.Ref(External(3000), 5); + CordRepRing* ring = CordRepRing::Create(flat1); + ring = CordRepRing::Append(ring, flat2); + ring = CordRepRing::Append(ring, flat3); + ring = ref.Ref(CordRepRing::Append(ring, external), 4); + auto* substring = ref.Ref(ref.NeedsUnref(Substring(ring))); + + + CordzStatistics expected; + expected.size = substring->length; + expected.estimated_memory_usage = SizeOf(ring) + SizeOf(flat1) + + SizeOf(flat2) + SizeOf(flat3) + + SizeOf(external) + SizeOf(substring); + expected.estimated_fair_share_memory_usage = FairShare(substring); + expected.node_count = 6; + 
expected.node_counts.flat = 3; + expected.node_counts.flat_128 = 1; + expected.node_counts.flat_256 = 2; + expected.node_counts.external = 1; + expected.node_counts.ring = 1; + expected.node_counts.substring = 1; + + EXPECT_THAT(SampleCord(substring), EqStatistics(expected)); +} + +TEST(CordzInfoStatisticsTest, ThreadSafety) { + Notification stop; + static constexpr int kNumThreads = 8; + int64_t sampled_node_count = 0; + + { + absl::synchronization_internal::ThreadPool pool(kNumThreads); + + // Run analyzer thread emulating a CordzHandler collection. + pool.Schedule([&]() { + while (!stop.HasBeenNotified()) { + // Run every 10us (about 100K total collections). + absl::SleepFor(absl::Microseconds(10)); + CordzSampleToken token; + for (const CordzInfo& cord_info : token) { + CordzStatistics stats = cord_info.GetCordzStatistics(); + sampled_node_count += stats.node_count; + } + } + }); + + // Run 'application threads' + for (int i = 0; i < kNumThreads; ++i) { + pool.Schedule([&]() { + // Track 0 - 2 cordz infos at a time, providing permutations of 0, 1 + // and 2 CordzHandle and CordzInfo queues being active, with plenty of + // 'empty to non empty' transitions. + InlineData cords[2]; + std::minstd_rand gen; + std::uniform_int_distribution<int> coin_toss(0, 1); + + while (!stop.HasBeenNotified()) { + for (InlineData& cord : cords) { + // 50/50 flip the state of the cord + if (coin_toss(gen) != 0) { + if (cord.is_tree()) { + // 50/50 simulate delete (untrack) or 'edit to empty' + if (coin_toss(gen) != 0) { + CordzInfo::MaybeUntrackCord(cord.cordz_info()); + } else { + CordzUpdateScope scope(cord.cordz_info(), + CordzUpdateTracker::kUnknown); + scope.SetCordRep(nullptr); + } + CordRep::Unref(cord.as_tree()); + cord.set_inline_size(0); + } else { + // 50/50 Ring or Flat coin toss + CordRep* rep = Flat(256); + rep = (coin_toss(gen) != 0) ? 
CordRepRing::Create(rep) : rep; + cord.make_tree(rep); + + // 50/50 sample + if (coin_toss(gen) != 0) { + CordzInfo::TrackCord(cord, CordzUpdateTracker::kUnknown); + } + } + } + } + } + for (InlineData& cord : cords) { + if (cord.is_tree()) { + CordzInfo::MaybeUntrackCord(cord.cordz_info()); + CordRep::Unref(cord.as_tree()); + } + } + }); + } + + // Run for 1 second to give memory and thread safety analyzers plenty of + // time to detect any mishaps or undefined behaviors. + absl::SleepFor(absl::Seconds(1)); + stop.Notify(); + } + + std::cout << "Sampled " << sampled_node_count << " nodes\n"; +} + +} // namespace +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_info_test.cc b/third_party/abseil-cpp/absl/strings/internal/cordz_info_test.cc new file mode 100644 index 0000000000..b98343ae79 --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_info_test.cc @@ -0,0 +1,341 @@ +// Copyright 2019 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "absl/strings/internal/cordz_info.h" + +#include <vector> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/base/config.h" +#include "absl/debugging/stacktrace.h" +#include "absl/debugging/symbolize.h" +#include "absl/strings/cordz_test_helpers.h" +#include "absl/strings/internal/cord_rep_flat.h" +#include "absl/strings/internal/cordz_handle.h" +#include "absl/strings/internal/cordz_statistics.h" +#include "absl/strings/internal/cordz_update_tracker.h" +#include "absl/strings/str_cat.h" +#include "absl/types/span.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { +namespace { + +using ::testing::ElementsAre; +using ::testing::Eq; +using ::testing::HasSubstr; +using ::testing::Ne; +using ::testing::SizeIs; + +// Used test values +auto constexpr kUnknownMethod = CordzUpdateTracker::kUnknown; +auto constexpr kTrackCordMethod = CordzUpdateTracker::kConstructorString; +auto constexpr kChildMethod = CordzUpdateTracker::kConstructorCord; +auto constexpr kUpdateMethod = CordzUpdateTracker::kAppendString; + +// Local less verbose helper +std::vector<const CordzHandle*> DeleteQueue() { + return CordzHandle::DiagnosticsGetDeleteQueue(); +} + +std::string FormatStack(absl::Span<void* const> raw_stack) { + static constexpr size_t buf_size = 1 << 14; + std::unique_ptr<char[]> buf(new char[buf_size]); + std::string output; + for (void* stackp : raw_stack) { + if (absl::Symbolize(stackp, buf.get(), buf_size)) { + absl::StrAppend(&output, " ", buf.get(), "\n"); + } + } + return output; +} + +TEST(CordzInfoTest, TrackCord) { + TestCordData data; + CordzInfo::TrackCord(data.data, kTrackCordMethod); + CordzInfo* info = data.data.cordz_info(); + ASSERT_THAT(info, Ne(nullptr)); + EXPECT_FALSE(info->is_snapshot()); + EXPECT_THAT(CordzInfo::Head(CordzSnapshot()), Eq(info)); + EXPECT_THAT(info->GetCordRepForTesting(), Eq(data.rep.rep)); + info->Untrack(); +} + +TEST(CordzInfoTest, MaybeTrackChildCordWithoutSampling) { + 
CordzSamplingIntervalHelper sample_none(99999); + TestCordData parent, child; + CordzInfo::MaybeTrackCord(child.data, parent.data, kTrackCordMethod); + EXPECT_THAT(child.data.cordz_info(), Eq(nullptr)); +} + +TEST(CordzInfoTest, MaybeTrackChildCordWithSampling) { + CordzSamplingIntervalHelper sample_all(1); + TestCordData parent, child; + CordzInfo::MaybeTrackCord(child.data, parent.data, kTrackCordMethod); + EXPECT_THAT(child.data.cordz_info(), Eq(nullptr)); +} + +TEST(CordzInfoTest, MaybeTrackChildCordWithoutSamplingParentSampled) { + CordzSamplingIntervalHelper sample_none(99999); + TestCordData parent, child; + CordzInfo::TrackCord(parent.data, kTrackCordMethod); + CordzInfo::MaybeTrackCord(child.data, parent.data, kTrackCordMethod); + CordzInfo* parent_info = parent.data.cordz_info(); + CordzInfo* child_info = child.data.cordz_info(); + ASSERT_THAT(child_info, Ne(nullptr)); + EXPECT_THAT(child_info->GetCordRepForTesting(), Eq(child.rep.rep)); + EXPECT_THAT(child_info->GetParentStack(), parent_info->GetStack()); + parent_info->Untrack(); + child_info->Untrack(); +} + +TEST(CordzInfoTest, MaybeTrackChildCordWithoutSamplingChildSampled) { + CordzSamplingIntervalHelper sample_none(99999); + TestCordData parent, child; + CordzInfo::TrackCord(child.data, kTrackCordMethod); + CordzInfo::MaybeTrackCord(child.data, parent.data, kTrackCordMethod); + EXPECT_THAT(child.data.cordz_info(), Eq(nullptr)); +} + +TEST(CordzInfoTest, MaybeTrackChildCordWithSamplingChildSampled) { + CordzSamplingIntervalHelper sample_all(1); + TestCordData parent, child; + CordzInfo::TrackCord(child.data, kTrackCordMethod); + CordzInfo::MaybeTrackCord(child.data, parent.data, kTrackCordMethod); + EXPECT_THAT(child.data.cordz_info(), Eq(nullptr)); +} + +TEST(CordzInfoTest, UntrackCord) { + TestCordData data; + CordzInfo::TrackCord(data.data, kTrackCordMethod); + CordzInfo* info = data.data.cordz_info(); + + info->Untrack(); + EXPECT_THAT(DeleteQueue(), SizeIs(0)); +} + +TEST(CordzInfoTest, 
UntrackCordWithSnapshot) { + TestCordData data; + CordzInfo::TrackCord(data.data, kTrackCordMethod); + CordzInfo* info = data.data.cordz_info(); + + CordzSnapshot snapshot; + info->Untrack(); + EXPECT_THAT(CordzInfo::Head(CordzSnapshot()), Eq(nullptr)); + EXPECT_THAT(info->GetCordRepForTesting(), Eq(data.rep.rep)); + EXPECT_THAT(DeleteQueue(), ElementsAre(info, &snapshot)); +} + +TEST(CordzInfoTest, SetCordRep) { + TestCordData data; + CordzInfo::TrackCord(data.data, kTrackCordMethod); + CordzInfo* info = data.data.cordz_info(); + + TestCordRep rep; + info->Lock(CordzUpdateTracker::kAppendCord); + info->SetCordRep(rep.rep); + info->Unlock(); + EXPECT_THAT(info->GetCordRepForTesting(), Eq(rep.rep)); + + info->Untrack(); +} + +TEST(CordzInfoTest, SetCordRepNullUntracksCordOnUnlock) { + TestCordData data; + CordzInfo::TrackCord(data.data, kTrackCordMethod); + CordzInfo* info = data.data.cordz_info(); + + info->Lock(CordzUpdateTracker::kAppendString); + info->SetCordRep(nullptr); + EXPECT_THAT(info->GetCordRepForTesting(), Eq(nullptr)); + EXPECT_THAT(CordzInfo::Head(CordzSnapshot()), Eq(info)); + + info->Unlock(); + EXPECT_THAT(CordzInfo::Head(CordzSnapshot()), Eq(nullptr)); +} + +TEST(CordzInfoTest, RefCordRep) { + TestCordData data; + CordzInfo::TrackCord(data.data, kTrackCordMethod); + CordzInfo* info = data.data.cordz_info(); + + size_t refcount = data.rep.rep->refcount.Get(); + EXPECT_THAT(info->RefCordRep(), Eq(data.rep.rep)); + EXPECT_THAT(data.rep.rep->refcount.Get(), Eq(refcount + 1)); + CordRep::Unref(data.rep.rep); + info->Untrack(); +} + +#if GTEST_HAS_DEATH_TEST + +TEST(CordzInfoTest, SetCordRepRequiresMutex) { + TestCordData data; + CordzInfo::TrackCord(data.data, kTrackCordMethod); + CordzInfo* info = data.data.cordz_info(); + TestCordRep rep; + EXPECT_DEBUG_DEATH(info->SetCordRep(rep.rep), ".*"); + info->Untrack(); +} + +#endif // GTEST_HAS_DEATH_TEST + +TEST(CordzInfoTest, TrackUntrackHeadFirstV2) { + CordzSnapshot snapshot; + 
EXPECT_THAT(CordzInfo::Head(snapshot), Eq(nullptr)); + + TestCordData data; + CordzInfo::TrackCord(data.data, kTrackCordMethod); + CordzInfo* info1 = data.data.cordz_info(); + ASSERT_THAT(CordzInfo::Head(snapshot), Eq(info1)); + EXPECT_THAT(info1->Next(snapshot), Eq(nullptr)); + + TestCordData data2; + CordzInfo::TrackCord(data2.data, kTrackCordMethod); + CordzInfo* info2 = data2.data.cordz_info(); + ASSERT_THAT(CordzInfo::Head(snapshot), Eq(info2)); + EXPECT_THAT(info2->Next(snapshot), Eq(info1)); + EXPECT_THAT(info1->Next(snapshot), Eq(nullptr)); + + info2->Untrack(); + ASSERT_THAT(CordzInfo::Head(snapshot), Eq(info1)); + EXPECT_THAT(info1->Next(snapshot), Eq(nullptr)); + + info1->Untrack(); + ASSERT_THAT(CordzInfo::Head(snapshot), Eq(nullptr)); +} + +TEST(CordzInfoTest, TrackUntrackTailFirstV2) { + CordzSnapshot snapshot; + EXPECT_THAT(CordzInfo::Head(snapshot), Eq(nullptr)); + + TestCordData data; + CordzInfo::TrackCord(data.data, kTrackCordMethod); + CordzInfo* info1 = data.data.cordz_info(); + ASSERT_THAT(CordzInfo::Head(snapshot), Eq(info1)); + EXPECT_THAT(info1->Next(snapshot), Eq(nullptr)); + + TestCordData data2; + CordzInfo::TrackCord(data2.data, kTrackCordMethod); + CordzInfo* info2 = data2.data.cordz_info(); + ASSERT_THAT(CordzInfo::Head(snapshot), Eq(info2)); + EXPECT_THAT(info2->Next(snapshot), Eq(info1)); + EXPECT_THAT(info1->Next(snapshot), Eq(nullptr)); + + info1->Untrack(); + ASSERT_THAT(CordzInfo::Head(snapshot), Eq(info2)); + EXPECT_THAT(info2->Next(snapshot), Eq(nullptr)); + + info2->Untrack(); + ASSERT_THAT(CordzInfo::Head(snapshot), Eq(nullptr)); +} + +TEST(CordzInfoTest, StackV2) { + TestCordData data; + // kMaxStackDepth is intentionally less than 64 (which is the max depth that + // Cordz will record) because if the actual stack depth is over 64 + // (which it is on Apple platforms) then the expected_stack will end up + // catching a few frames at the end that the actual_stack didn't get and + // it will no longer be subset. 
At the time of this writing 58 is the max + // that will allow this test to pass (with a minimum os version of iOS 9), so + // rounded down to 50 to hopefully not run into this in the future if Apple + // makes small modifications to its testing stack. 50 is sufficient to prove + // that we got a decent stack. + static constexpr int kMaxStackDepth = 50; + CordzInfo::TrackCord(data.data, kTrackCordMethod); + CordzInfo* info = data.data.cordz_info(); + std::vector<void*> local_stack; + local_stack.resize(kMaxStackDepth); + // In some environments we don't get stack traces. For example in Android + // absl::GetStackTrace will return 0 indicating it didn't find any stack. The + // resultant formatted stack will be "", but that still equals the stack + // recorded in CordzInfo, which is also empty. The skip_count is 1 so that the + // line number of the current stack isn't included in the HasSubstr check. + local_stack.resize(absl::GetStackTrace(local_stack.data(), kMaxStackDepth, + /*skip_count=*/1)); + + std::string got_stack = FormatStack(info->GetStack()); + std::string expected_stack = FormatStack(local_stack); + // If TrackCord is inlined, got_stack should match expected_stack. If it isn't + // inlined, got_stack should include an additional frame not present in + // expected_stack. Either way, expected_stack should be a substring of + // got_stack. + EXPECT_THAT(got_stack, HasSubstr(expected_stack)); + + info->Untrack(); +} + +// Local helper functions to get different stacks for child and parent. 
+CordzInfo* TrackChildCord(InlineData& data, const InlineData& parent) { + CordzInfo::TrackCord(data, parent, kChildMethod); + return data.cordz_info(); +} +CordzInfo* TrackParentCord(InlineData& data) { + CordzInfo::TrackCord(data, kTrackCordMethod); + return data.cordz_info(); +} + +TEST(CordzInfoTest, GetStatistics) { + TestCordData data; + CordzInfo* info = TrackParentCord(data.data); + + CordzStatistics statistics = info->GetCordzStatistics(); + EXPECT_THAT(statistics.size, Eq(data.rep.rep->length)); + EXPECT_THAT(statistics.method, Eq(kTrackCordMethod)); + EXPECT_THAT(statistics.parent_method, Eq(kUnknownMethod)); + EXPECT_THAT(statistics.update_tracker.Value(kTrackCordMethod), Eq(1)); + + info->Untrack(); +} + +TEST(CordzInfoTest, LockCountsMethod) { + TestCordData data; + CordzInfo* info = TrackParentCord(data.data); + + info->Lock(kUpdateMethod); + info->Unlock(); + info->Lock(kUpdateMethod); + info->Unlock(); + + CordzStatistics statistics = info->GetCordzStatistics(); + EXPECT_THAT(statistics.update_tracker.Value(kUpdateMethod), Eq(2)); + + info->Untrack(); +} + +TEST(CordzInfoTest, FromParent) { + TestCordData parent; + TestCordData child; + CordzInfo* info_parent = TrackParentCord(parent.data); + CordzInfo* info_child = TrackChildCord(child.data, parent.data); + + std::string stack = FormatStack(info_parent->GetStack()); + std::string parent_stack = FormatStack(info_child->GetParentStack()); + EXPECT_THAT(stack, Eq(parent_stack)); + + CordzStatistics statistics = info_child->GetCordzStatistics(); + EXPECT_THAT(statistics.size, Eq(child.rep.rep->length)); + EXPECT_THAT(statistics.method, Eq(kChildMethod)); + EXPECT_THAT(statistics.parent_method, Eq(kTrackCordMethod)); + EXPECT_THAT(statistics.update_tracker.Value(kChildMethod), Eq(1)); + + info_parent->Untrack(); + info_child->Untrack(); +} + +} // namespace +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl diff --git 
a/third_party/abseil-cpp/absl/strings/internal/cordz_sample_token.cc b/third_party/abseil-cpp/absl/strings/internal/cordz_sample_token.cc new file mode 100644 index 0000000000..ba1270d8f0 --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_sample_token.cc @@ -0,0 +1,64 @@ +// Copyright 2019 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "absl/strings/internal/cordz_sample_token.h" + +#include "absl/base/config.h" +#include "absl/strings/internal/cordz_handle.h" +#include "absl/strings/internal/cordz_info.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { + +CordzSampleToken::Iterator& CordzSampleToken::Iterator::operator++() { + if (current_) { + current_ = current_->Next(*token_); + } + return *this; +} + +CordzSampleToken::Iterator CordzSampleToken::Iterator::operator++(int) { + Iterator it(*this); + operator++(); + return it; +} + +bool operator==(const CordzSampleToken::Iterator& lhs, + const CordzSampleToken::Iterator& rhs) { + return lhs.current_ == rhs.current_ && + (lhs.current_ == nullptr || lhs.token_ == rhs.token_); +} + +bool operator!=(const CordzSampleToken::Iterator& lhs, + const CordzSampleToken::Iterator& rhs) { + return !(lhs == rhs); +} + +CordzSampleToken::Iterator::reference CordzSampleToken::Iterator::operator*() + const { + return *current_; +} + +CordzSampleToken::Iterator::pointer CordzSampleToken::Iterator::operator->() + const { + return current_; +} + 
+CordzSampleToken::Iterator::Iterator(const CordzSampleToken* token) + : token_(token), current_(CordzInfo::Head(*token)) {} + +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_sample_token.h b/third_party/abseil-cpp/absl/strings/internal/cordz_sample_token.h new file mode 100644 index 0000000000..28a1d70ccc --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_sample_token.h @@ -0,0 +1,97 @@ +// Copyright 2019 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "absl/base/config.h" +#include "absl/strings/internal/cordz_handle.h" +#include "absl/strings/internal/cordz_info.h" + +#ifndef ABSL_STRINGS_CORDZ_SAMPLE_TOKEN_H_ +#define ABSL_STRINGS_CORDZ_SAMPLE_TOKEN_H_ + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { + +// The existence of a CordzSampleToken guarantees that a reader can traverse the +// global_cordz_infos_head linked-list without needing to hold a mutex. When a +// CordzSampleToken exists, all CordzInfo objects that would be destroyed are +// instead appended to a deletion queue. When the CordzSampleToken is destroyed, +// it will also clean up any of these CordzInfo objects. +// +// E.g., ST are CordzSampleToken objects and CH are CordzHandle objects. 
+// ST1 <- CH1 <- CH2 <- ST2 <- CH3 <- global_delete_queue_tail
+//
+// This list tracks that CH1 and CH2 were created after ST1, so the thread
+// holding ST1 might have a reference to CH1, CH2, ST2, and CH3. However, ST2
+// was created later, so the thread holding the ST2 token cannot have a
+// reference to ST1, CH1, or CH2. If ST1 is cleaned up first, that thread will
+// delete ST1, CH1, and CH2. If instead ST2 is cleaned up first, that thread
+// will only delete ST2.
+//
+// If ST1 is cleaned up first, the new list will be:
+// ST2 <- CH3 <- global_delete_queue_tail
+//
+// If ST2 is cleaned up first, the new list will be:
+// ST1 <- CH1 <- CH2 <- CH3 <- global_delete_queue_tail
+//
+// All new CordzHandle objects are appended to the list, so if a new thread
+// comes along before either ST1 or ST2 is cleaned up, the new list will be:
+// ST1 <- CH1 <- CH2 <- ST2 <- CH3 <- ST3 <- global_delete_queue_tail
+//
+// A thread must hold the global_delete_queue_mu mutex whenever it's altering
+// this list.
+//
+// It is safe for a thread that holds a CordzSampleToken to read
+// global_cordz_infos at any time since the objects it is able to retrieve will
+// not be deleted while the CordzSampleToken exists.
+class CordzSampleToken : public CordzSnapshot {
+ public:
+  // Single-pass input iterator over the CordzInfo instances visible to the
+  // owning token. Equality is defined by the free operator== in the .cc:
+  // two end() iterators compare equal; non-end() iterators compare equal
+  // only when they reference the same CordzInfo through the same token.
+  class Iterator {
+   public:
+    using iterator_category = std::input_iterator_tag;
+    using value_type = const CordzInfo&;
+    using difference_type = ptrdiff_t;
+    using pointer = const CordzInfo*;
+    using reference = value_type;
+
+    // A default-constructed Iterator is an end() iterator.
+    Iterator() = default;
+
+    Iterator& operator++();
+    Iterator operator++(int);
+    friend bool operator==(const Iterator& lhs, const Iterator& rhs);
+    friend bool operator!=(const Iterator& lhs, const Iterator& rhs);
+    reference operator*() const;
+    pointer operator->() const;
+
+   private:
+    friend class CordzSampleToken;
+    // Positions the iterator at the head of `token`'s CordzInfo list.
+    explicit Iterator(const CordzSampleToken* token);
+
+    // Token guarding the traversal; nullptr for end() iterators.
+    const CordzSampleToken* token_ = nullptr;
+    // CordzInfo currently referenced; nullptr for end() iterators.
+    pointer current_ = nullptr;
+  };
+
+  CordzSampleToken() = default;
+  // Not copyable or assignable: a token's identity is tied to its position
+  // in the global deletion queue (see file comment above).
+  CordzSampleToken(const CordzSampleToken&) = delete;
+  CordzSampleToken& operator=(const CordzSampleToken&) = delete;
+
+  // Iterates the CordzInfo instances that are visible to this token.
+  Iterator begin() { return Iterator(this); }
+  Iterator end() { return Iterator(); }
+};
+
+}  // namespace cord_internal
+ABSL_NAMESPACE_END
+}  // namespace absl
+
+#endif  // ABSL_STRINGS_CORDZ_SAMPLE_TOKEN_H_
diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_sample_token_test.cc b/third_party/abseil-cpp/absl/strings/internal/cordz_sample_token_test.cc
new file mode 100644
index 0000000000..9f54301d68
--- /dev/null
+++ b/third_party/abseil-cpp/absl/strings/internal/cordz_sample_token_test.cc
@@ -0,0 +1,208 @@
+// Copyright 2019 The Abseil Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "absl/strings/internal/cordz_sample_token.h" + +#include <memory> +#include <type_traits> +#include <vector> + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/memory/memory.h" +#include "absl/random/random.h" +#include "absl/strings/cordz_test_helpers.h" +#include "absl/strings/internal/cord_rep_flat.h" +#include "absl/strings/internal/cordz_handle.h" +#include "absl/strings/internal/cordz_info.h" +#include "absl/synchronization/internal/thread_pool.h" +#include "absl/synchronization/notification.h" +#include "absl/time/clock.h" +#include "absl/time/time.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { +namespace { + +using ::testing::ElementsAre; +using ::testing::Eq; +using ::testing::Ne; + +// Used test values +auto constexpr kTrackCordMethod = CordzUpdateTracker::kConstructorString; + +TEST(CordzSampleTokenTest, IteratorTraits) { + static_assert(std::is_copy_constructible<CordzSampleToken::Iterator>::value, + ""); + static_assert(std::is_copy_assignable<CordzSampleToken::Iterator>::value, ""); + static_assert(std::is_move_constructible<CordzSampleToken::Iterator>::value, + ""); + static_assert(std::is_move_assignable<CordzSampleToken::Iterator>::value, ""); + static_assert( + std::is_same< + std::iterator_traits<CordzSampleToken::Iterator>::iterator_category, + std::input_iterator_tag>::value, + ""); + static_assert( + std::is_same<std::iterator_traits<CordzSampleToken::Iterator>::value_type, + const CordzInfo&>::value, + ""); + static_assert( + std::is_same< + std::iterator_traits<CordzSampleToken::Iterator>::difference_type, + ptrdiff_t>::value, + ""); + static_assert( + std::is_same<std::iterator_traits<CordzSampleToken::Iterator>::pointer, + const CordzInfo*>::value, + ""); + static_assert( + std::is_same<std::iterator_traits<CordzSampleToken::Iterator>::reference, + const CordzInfo&>::value, + 
""); +} + +TEST(CordzSampleTokenTest, IteratorEmpty) { + CordzSampleToken token; + EXPECT_THAT(token.begin(), Eq(token.end())); +} + +TEST(CordzSampleTokenTest, Iterator) { + TestCordData cord1, cord2, cord3; + CordzInfo::TrackCord(cord1.data, kTrackCordMethod); + CordzInfo* info1 = cord1.data.cordz_info(); + CordzInfo::TrackCord(cord2.data, kTrackCordMethod); + CordzInfo* info2 = cord2.data.cordz_info(); + CordzInfo::TrackCord(cord3.data, kTrackCordMethod); + CordzInfo* info3 = cord3.data.cordz_info(); + + CordzSampleToken token; + std::vector<const CordzInfo*> found; + for (const CordzInfo& cord_info : token) { + found.push_back(&cord_info); + } + + EXPECT_THAT(found, ElementsAre(info3, info2, info1)); + + info1->Untrack(); + info2->Untrack(); + info3->Untrack(); +} + +TEST(CordzSampleTokenTest, IteratorEquality) { + TestCordData cord1; + TestCordData cord2; + TestCordData cord3; + CordzInfo::TrackCord(cord1.data, kTrackCordMethod); + CordzInfo* info1 = cord1.data.cordz_info(); + + CordzSampleToken token1; + // lhs starts with the CordzInfo corresponding to cord1 at the head. + CordzSampleToken::Iterator lhs = token1.begin(); + + CordzInfo::TrackCord(cord2.data, kTrackCordMethod); + CordzInfo* info2 = cord2.data.cordz_info(); + + CordzSampleToken token2; + // rhs starts with the CordzInfo corresponding to cord2 at the head. + CordzSampleToken::Iterator rhs = token2.begin(); + + CordzInfo::TrackCord(cord3.data, kTrackCordMethod); + CordzInfo* info3 = cord3.data.cordz_info(); + + // lhs is on cord1 while rhs is on cord2. + EXPECT_THAT(lhs, Ne(rhs)); + + rhs++; + // lhs and rhs are both on cord1, but they didn't come from the same + // CordzSampleToken. + EXPECT_THAT(lhs, Ne(rhs)); + + lhs++; + rhs++; + // Both lhs and rhs are done, so they are on nullptr. 
+ EXPECT_THAT(lhs, Eq(rhs)); + + info1->Untrack(); + info2->Untrack(); + info3->Untrack(); +} + +TEST(CordzSampleTokenTest, MultiThreaded) { + Notification stop; + static constexpr int kNumThreads = 4; + static constexpr int kNumCords = 3; + static constexpr int kNumTokens = 3; + absl::synchronization_internal::ThreadPool pool(kNumThreads); + + for (int i = 0; i < kNumThreads; ++i) { + pool.Schedule([&stop]() { + absl::BitGen gen; + TestCordData cords[kNumCords]; + std::unique_ptr<CordzSampleToken> tokens[kNumTokens]; + + while (!stop.HasBeenNotified()) { + // Randomly perform one of five actions: + // 1) Untrack + // 2) Track + // 3) Iterate over Cords visible to a token. + // 4) Unsample + // 5) Sample + int index = absl::Uniform(gen, 0, kNumCords); + if (absl::Bernoulli(gen, 0.5)) { + TestCordData& cord = cords[index]; + // Track/untrack. + if (cord.data.is_profiled()) { + // 1) Untrack + cord.data.cordz_info()->Untrack(); + cord.data.clear_cordz_info();; + } else { + // 2) Track + CordzInfo::TrackCord(cord.data, kTrackCordMethod); + } + } else { + std::unique_ptr<CordzSampleToken>& token = tokens[index]; + if (token) { + if (absl::Bernoulli(gen, 0.5)) { + // 3) Iterate over Cords visible to a token. + for (const CordzInfo& info : *token) { + // This is trivial work to allow us to compile the loop. + EXPECT_THAT(info.Next(*token), Ne(&info)); + } + } else { + // 4) Unsample + token = nullptr; + } + } else { + // 5) Sample + token = absl::make_unique<CordzSampleToken>(); + } + } + } + for (TestCordData& cord : cords) { + CordzInfo::MaybeUntrackCord(cord.data.cordz_info()); + } + }); + } + // The threads will hammer away. Give it a little bit of time for tsan to + // spot errors. 
+ absl::SleepFor(absl::Seconds(3)); + stop.Notify(); +} + +} // namespace +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_statistics.h b/third_party/abseil-cpp/absl/strings/internal/cordz_statistics.h new file mode 100644 index 0000000000..e03c651e9c --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_statistics.h @@ -0,0 +1,84 @@ +// Copyright 2019 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ABSL_STRINGS_INTERNAL_CORDZ_STATISTICS_H_ +#define ABSL_STRINGS_INTERNAL_CORDZ_STATISTICS_H_ + +#include <cstdint> + +#include "absl/base/config.h" +#include "absl/strings/internal/cordz_update_tracker.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { + +// CordzStatistics captures some meta information about a Cord's shape. 
+struct CordzStatistics { + using MethodIdentifier = CordzUpdateTracker::MethodIdentifier; + + // Node counts information + struct NodeCounts { + size_t flat = 0; // #flats + size_t flat_64 = 0; // #flats up to 64 bytes + size_t flat_128 = 0; // #flats up to 128 bytes + size_t flat_256 = 0; // #flats up to 256 bytes + size_t flat_512 = 0; // #flats up to 512 bytes + size_t flat_1k = 0; // #flats up to 1K bytes + size_t external = 0; // #external reps + size_t substring = 0; // #substring reps + size_t concat = 0; // #concat reps + size_t ring = 0; // #ring buffer reps + }; + + // The size of the cord in bytes. This matches the result of Cord::size(). + int64_t size = 0; + + // The estimated memory used by the sampled cord. This value matches the + // value as reported by Cord::EstimatedMemoryUsage(). + // A value of 0 implies the property has not been recorded. + int64_t estimated_memory_usage = 0; + + // The effective memory used by the sampled cord, inversely weighted by the + // effective indegree of each allocated node. This is a representation of the + // fair share of memory usage that should be attributed to the sampled cord. + // This value is more useful for cases where one or more nodes are referenced + // by multiple Cord instances, and for cases where a Cord includes the same + // node multiple times (either directly or indirectly). + // A value of 0 implies the property has not been recorded. + int64_t estimated_fair_share_memory_usage = 0; + + // The total number of nodes referenced by this cord. + // For ring buffer Cords, this includes the 'ring buffer' node. + // A value of 0 implies the property has not been recorded. + int64_t node_count = 0; + + // Detailed node counts per type + NodeCounts node_counts; + + // The cord method responsible for sampling the cord. + MethodIdentifier method = MethodIdentifier::kUnknown; + + // The cord method responsible for sampling the parent cord if applicable. 
+ MethodIdentifier parent_method = MethodIdentifier::kUnknown; + + // Update tracker tracking invocation count per cord method. + CordzUpdateTracker update_tracker; +}; + +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl + +#endif // ABSL_STRINGS_INTERNAL_CORDZ_STATISTICS_H_ diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_update_scope.h b/third_party/abseil-cpp/absl/strings/internal/cordz_update_scope.h new file mode 100644 index 0000000000..57ba75de93 --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_update_scope.h @@ -0,0 +1,71 @@ +// Copyright 2021 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ABSL_STRINGS_INTERNAL_CORDZ_UPDATE_SCOPE_H_ +#define ABSL_STRINGS_INTERNAL_CORDZ_UPDATE_SCOPE_H_ + +#include "absl/base/config.h" +#include "absl/base/optimization.h" +#include "absl/base/thread_annotations.h" +#include "absl/strings/internal/cord_internal.h" +#include "absl/strings/internal/cordz_info.h" +#include "absl/strings/internal/cordz_update_tracker.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { + +// CordzUpdateScope scopes an update to the provided CordzInfo. +// The class invokes `info->Lock(method)` and `info->Unlock()` to guard +// cordrep updates. This class does nothing if `info` is null. +// See also the 'Lock`, `Unlock` and `SetCordRep` methods in `CordzInfo`. 
+class ABSL_SCOPED_LOCKABLE CordzUpdateScope {
+ public:
+  // Acquires `info`'s update lock for `method`; no-op when `info` is null.
+  // ABSL_PREDICT_FALSE: sampled (non-null info) cords are expected to be
+  // the rare case — TODO confirm against the sampling rate.
+  CordzUpdateScope(CordzInfo* info, CordzUpdateTracker::MethodIdentifier method)
+      ABSL_EXCLUSIVE_LOCK_FUNCTION(info)
+      : info_(info) {
+    if (ABSL_PREDICT_FALSE(info_)) {
+      info->Lock(method);
+    }
+  }
+
+  // CordzUpdateScope can not be copied or assigned to.
+  CordzUpdateScope(CordzUpdateScope&& rhs) = delete;
+  CordzUpdateScope(const CordzUpdateScope&) = delete;
+  CordzUpdateScope& operator=(CordzUpdateScope&& rhs) = delete;
+  CordzUpdateScope& operator=(const CordzUpdateScope&) = delete;
+
+  // Releases the lock acquired by the constructor, if any.
+  ~CordzUpdateScope() ABSL_UNLOCK_FUNCTION() {
+    if (ABSL_PREDICT_FALSE(info_)) {
+      info_->Unlock();
+    }
+  }
+
+  // Updates the cordrep of the guarded CordzInfo; no-op when not sampled.
+  void SetCordRep(CordRep* rep) const {
+    if (ABSL_PREDICT_FALSE(info_)) {
+      info_->SetCordRep(rep);
+    }
+  }
+
+  // Returns the (possibly null) CordzInfo guarded by this scope.
+  CordzInfo* info() const { return info_; }
+
+ private:
+  CordzInfo* info_;
+};
+
+}  // namespace cord_internal
+ABSL_NAMESPACE_END
+}  // namespace absl
+
+#endif  // ABSL_STRINGS_INTERNAL_CORDZ_UPDATE_SCOPE_H_
diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_update_scope_test.cc b/third_party/abseil-cpp/absl/strings/internal/cordz_update_scope_test.cc
new file mode 100644
index 0000000000..3d08c622d0
--- /dev/null
+++ b/third_party/abseil-cpp/absl/strings/internal/cordz_update_scope_test.cc
@@ -0,0 +1,49 @@
+// Copyright 2021 The Abseil Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "absl/strings/internal/cordz_update_scope.h" + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/base/config.h" +#include "absl/strings/cordz_test_helpers.h" +#include "absl/strings/internal/cord_rep_flat.h" +#include "absl/strings/internal/cordz_info.h" +#include "absl/strings/internal/cordz_update_tracker.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { + +namespace { + +// Used test values +auto constexpr kTrackCordMethod = CordzUpdateTracker::kConstructorString; + +TEST(CordzUpdateScopeTest, ScopeNullptr) { + CordzUpdateScope scope(nullptr, kTrackCordMethod); +} + +TEST(CordzUpdateScopeTest, ScopeSampledCord) { + TestCordData cord; + CordzInfo::TrackCord(cord.data, kTrackCordMethod); + CordzUpdateScope scope(cord.data.cordz_info(), kTrackCordMethod); + cord.data.cordz_info()->SetCordRep(nullptr); +} + +} // namespace +ABSL_NAMESPACE_END +} // namespace cord_internal + +} // namespace absl diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_update_tracker.h b/third_party/abseil-cpp/absl/strings/internal/cordz_update_tracker.h new file mode 100644 index 0000000000..02efcc3a2d --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_update_tracker.h @@ -0,0 +1,119 @@ +// Copyright 2021 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef ABSL_STRINGS_INTERNAL_CORDZ_UPDATE_TRACKER_H_ +#define ABSL_STRINGS_INTERNAL_CORDZ_UPDATE_TRACKER_H_ + +#include <atomic> +#include <cstdint> + +#include "absl/base/config.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { + +// CordzUpdateTracker tracks counters for Cord update methods. +// +// The purpose of CordzUpdateTracker is to track the number of calls to methods +// updating Cord data for sampled cords. The class internally uses 'lossy' +// atomic operations: Cord is thread-compatible, so there is no need to +// synchronize updates. However, Cordz collection threads may call 'Value()' at +// any point, so the class needs to provide thread safe access. +// +// This class is thread-safe. But as per above comments, all non-const methods +// should be used single-threaded only: updates are thread-safe but lossy. +class CordzUpdateTracker { + public: + // Tracked update methods. + enum MethodIdentifier { + kUnknown, + kAppendCord, + kAppendExternalMemory, + kAppendString, + kAssignCord, + kAssignString, + kClear, + kConstructorCord, + kConstructorString, + kCordReader, + kFlatten, + kGetAppendRegion, + kMakeCordFromExternal, + kMoveAppendCord, + kMoveAssignCord, + kMovePrependCord, + kPrependCord, + kPrependString, + kRemovePrefix, + kRemoveSuffix, + kSubCord, + + // kNumMethods defines the number of entries: must be the last entry. + kNumMethods, + }; + + // Constructs a new instance. All counters are zero-initialized. + constexpr CordzUpdateTracker() noexcept : values_{} {} + + // Copy constructs a new instance. + CordzUpdateTracker(const CordzUpdateTracker& rhs) noexcept { *this = rhs; } + + // Assigns the provided value to this instance. + CordzUpdateTracker& operator=(const CordzUpdateTracker& rhs) noexcept { + for (int i = 0; i < kNumMethods; ++i) { + values_[i].store(rhs.values_[i].load(std::memory_order_relaxed), + std::memory_order_relaxed); + } + return *this; + } + + // Returns the value for the specified method. 
+ int64_t Value(MethodIdentifier method) const { + return values_[method].load(std::memory_order_relaxed); + } + + // Increases the value for the specified method by `n` + void LossyAdd(MethodIdentifier method, int64_t n = 1) { + auto& value = values_[method]; + value.store(value.load(std::memory_order_relaxed) + n, + std::memory_order_relaxed); + } + + // Adds all the values from `src` to this instance + void LossyAdd(const CordzUpdateTracker& src) { + for (int i = 0; i < kNumMethods; ++i) { + MethodIdentifier method = static_cast<MethodIdentifier>(i); + if (int64_t value = src.Value(method)) { + LossyAdd(method, value); + } + } + } + + private: + // Until C++20 std::atomic is not constexpr default-constructible, so we need + // a wrapper for this class to be constexpr constructible. + class Counter : public std::atomic<int64_t> { + public: + constexpr Counter() noexcept : std::atomic<int64_t>(0) {} + }; + + Counter values_[kNumMethods]; +}; + +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl + +#endif // ABSL_STRINGS_INTERNAL_CORDZ_UPDATE_TRACKER_H_ diff --git a/third_party/abseil-cpp/absl/strings/internal/cordz_update_tracker_test.cc b/third_party/abseil-cpp/absl/strings/internal/cordz_update_tracker_test.cc new file mode 100644 index 0000000000..fcd17df7a0 --- /dev/null +++ b/third_party/abseil-cpp/absl/strings/internal/cordz_update_tracker_test.cc @@ -0,0 +1,143 @@ +// Copyright 2021 The Abseil Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "absl/strings/internal/cordz_update_tracker.h" + +#include <array> +#include <thread> // NOLINT + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/base/attributes.h" +#include "absl/base/config.h" +#include "absl/synchronization/notification.h" + +namespace absl { +ABSL_NAMESPACE_BEGIN +namespace cord_internal { +namespace { + +using ::testing::AnyOf; +using ::testing::Eq; + +using Method = CordzUpdateTracker::MethodIdentifier; +using Methods = std::array<Method, Method::kNumMethods>; + +// Returns an array of all methods defined in `MethodIdentifier` +Methods AllMethods() { + return Methods{Method::kUnknown, + Method::kAppendCord, + Method::kAppendExternalMemory, + Method::kAppendString, + Method::kAssignCord, + Method::kAssignString, + Method::kClear, + Method::kConstructorCord, + Method::kConstructorString, + Method::kCordReader, + Method::kFlatten, + Method::kGetAppendRegion, + Method::kMakeCordFromExternal, + Method::kMoveAppendCord, + Method::kMoveAssignCord, + Method::kMovePrependCord, + Method::kPrependCord, + Method::kPrependString, + Method::kRemovePrefix, + Method::kRemoveSuffix, + Method::kSubCord}; +} + +TEST(CordzUpdateTracker, IsConstExprAndInitializesToZero) { + constexpr CordzUpdateTracker tracker; + for (Method method : AllMethods()) { + ASSERT_THAT(tracker.Value(method), Eq(0)); + } +} + +TEST(CordzUpdateTracker, LossyAdd) { + int64_t n = 1; + CordzUpdateTracker tracker; + for (Method method : AllMethods()) { + tracker.LossyAdd(method, n); + EXPECT_THAT(tracker.Value(method), Eq(n)); + n += 2; + } +} + +TEST(CordzUpdateTracker, CopyConstructor) { + int64_t n = 1; + CordzUpdateTracker src; + for (Method method : AllMethods()) { + src.LossyAdd(method, n); + n += 2; + } + + n = 1; + CordzUpdateTracker tracker(src); + for (Method method : AllMethods()) { + EXPECT_THAT(tracker.Value(method), Eq(n)); + n += 2; + } +} 
+ +TEST(CordzUpdateTracker, OperatorAssign) { + int64_t n = 1; + CordzUpdateTracker src; + CordzUpdateTracker tracker; + for (Method method : AllMethods()) { + src.LossyAdd(method, n); + n += 2; + } + + n = 1; + tracker = src; + for (Method method : AllMethods()) { + EXPECT_THAT(tracker.Value(method), Eq(n)); + n += 2; + } +} + +TEST(CordzUpdateTracker, ThreadSanitizedValueCheck) { + absl::Notification done; + CordzUpdateTracker tracker; + + std::thread reader([&done, &tracker] { + while (!done.HasBeenNotified()) { + int n = 1; + for (Method method : AllMethods()) { + EXPECT_THAT(tracker.Value(method), AnyOf(Eq(n), Eq(0))); + n += 2; + } + } + int n = 1; + for (Method method : AllMethods()) { + EXPECT_THAT(tracker.Value(method), Eq(n)); + n += 2; + } + }); + + int64_t n = 1; + for (Method method : AllMethods()) { + tracker.LossyAdd(method, n); + n += 2; + } + done.Notify(); + reader.join(); +} + +} // namespace +} // namespace cord_internal +ABSL_NAMESPACE_END +} // namespace absl diff --git a/third_party/abseil-cpp/absl/strings/internal/resize_uninitialized.h b/third_party/abseil-cpp/absl/strings/internal/resize_uninitialized.h index e42628e394..749c66e78e 100644 --- a/third_party/abseil-cpp/absl/strings/internal/resize_uninitialized.h +++ b/third_party/abseil-cpp/absl/strings/internal/resize_uninitialized.h @@ -17,6 +17,7 @@ #ifndef ABSL_STRINGS_INTERNAL_RESIZE_UNINITIALIZED_H_ #define ABSL_STRINGS_INTERNAL_RESIZE_UNINITIALIZED_H_ +#include <algorithm> #include <string> #include <type_traits> #include <utility> @@ -66,6 +67,28 @@ inline void STLStringResizeUninitialized(string_type* s, size_t new_size) { ResizeUninitializedTraits<string_type>::Resize(s, new_size); } +// Used to ensure exponential growth so that the amortized complexity of +// increasing the string size by a small amount is O(1), in contrast to +// O(str->size()) in the case of precise growth. 
+template <typename string_type> +void STLStringReserveAmortized(string_type* s, size_t new_size) { + const size_t cap = s->capacity(); + if (new_size > cap) { + // Make sure to always grow by at least a factor of 2x. + s->reserve((std::max)(new_size, 2 * cap)); + } +} + +// Like STLStringResizeUninitialized(str, new_size), except guaranteed to use +// exponential growth so that the amortized complexity of increasing the string +// size by a small amount is O(1), in contrast to O(str->size()) in the case of +// precise growth. +template <typename string_type> +void STLStringResizeUninitializedAmortized(string_type* s, size_t new_size) { + STLStringReserveAmortized(s, new_size); + STLStringResizeUninitialized(s, new_size); +} + } // namespace strings_internal ABSL_NAMESPACE_END } // namespace absl diff --git a/third_party/abseil-cpp/absl/strings/internal/resize_uninitialized_test.cc b/third_party/abseil-cpp/absl/strings/internal/resize_uninitialized_test.cc index 0f8b3c2a95..01ee476b6c 100644 --- a/third_party/abseil-cpp/absl/strings/internal/resize_uninitialized_test.cc +++ b/third_party/abseil-cpp/absl/strings/internal/resize_uninitialized_test.cc @@ -24,11 +24,13 @@ int resize_call_count = 0; // resize() method has been called. struct resizable_string { size_t size() const { return 0; } + size_t capacity() const { return 0; } char& operator[](size_t) { static char c = '\0'; return c; } void resize(size_t) { resize_call_count += 1; } + void reserve(size_t) {} }; int resize_default_init_call_count = 0; @@ -37,12 +39,14 @@ int resize_default_init_call_count = 0; // resize() and __resize_default_init() methods have been called. 
struct resize_default_init_string { size_t size() const { return 0; } + size_t capacity() const { return 0; } char& operator[](size_t) { static char c = '\0'; return c; } void resize(size_t) { resize_call_count += 1; } void __resize_default_init(size_t) { resize_default_init_call_count += 1; } + void reserve(size_t) {} }; TEST(ResizeUninit, WithAndWithout) { @@ -60,6 +64,9 @@ TEST(ResizeUninit, WithAndWithout) { absl::strings_internal::STLStringResizeUninitialized(&rs, 237); EXPECT_EQ(resize_call_count, 1); EXPECT_EQ(resize_default_init_call_count, 0); + absl::strings_internal::STLStringResizeUninitializedAmortized(&rs, 1000); + EXPECT_EQ(resize_call_count, 2); + EXPECT_EQ(resize_default_init_call_count, 0); } resize_call_count = 0; @@ -76,7 +83,23 @@ TEST(ResizeUninit, WithAndWithout) { absl::strings_internal::STLStringResizeUninitialized(&rus, 237); EXPECT_EQ(resize_call_count, 0); EXPECT_EQ(resize_default_init_call_count, 1); + absl::strings_internal::STLStringResizeUninitializedAmortized(&rus, 1000); + EXPECT_EQ(resize_call_count, 0); + EXPECT_EQ(resize_default_init_call_count, 2); + } +} + +TEST(ResizeUninit, Amortized) { + std::string str; + size_t prev_cap = str.capacity(); + int cap_increase_count = 0; + for (int i = 0; i < 1000; ++i) { + absl::strings_internal::STLStringResizeUninitializedAmortized(&str, i); + size_t new_cap = str.capacity(); + if (new_cap > prev_cap) ++cap_increase_count; + prev_cap = new_cap; } + EXPECT_LT(cap_increase_count, 50); } } // namespace diff --git a/third_party/abseil-cpp/absl/strings/internal/str_format/arg.h b/third_party/abseil-cpp/absl/strings/internal/str_format/arg.h index 7040c86677..3c91be701f 100644 --- a/third_party/abseil-cpp/absl/strings/internal/str_format/arg.h +++ b/third_party/abseil-cpp/absl/strings/internal/str_format/arg.h @@ -122,6 +122,14 @@ StringConvertResult FormatConvertImpl(const std::string& v, StringConvertResult FormatConvertImpl(string_view v, FormatConversionSpecImpl conv, FormatSinkImpl* sink); 
+#if defined(ABSL_HAVE_STD_STRING_VIEW) && !defined(ABSL_USES_STD_STRING_VIEW) +inline StringConvertResult FormatConvertImpl(std::string_view v, + FormatConversionSpecImpl conv, + FormatSinkImpl* sink) { + return FormatConvertImpl(absl::string_view(v.data(), v.size()), conv, sink); +} +#endif // ABSL_HAVE_STD_STRING_VIEW && !ABSL_USES_STD_STRING_VIEW + ArgConvertResult<FormatConversionCharSetUnion( FormatConversionCharSetInternal::s, FormatConversionCharSetInternal::p)> FormatConvertImpl(const char* v, const FormatConversionSpecImpl conv, diff --git a/third_party/abseil-cpp/absl/strings/internal/str_format/bind.cc b/third_party/abseil-cpp/absl/strings/internal/str_format/bind.cc index 4e68b90b5c..c988ba8fd2 100644 --- a/third_party/abseil-cpp/absl/strings/internal/str_format/bind.cc +++ b/third_party/abseil-cpp/absl/strings/internal/str_format/bind.cc @@ -58,7 +58,7 @@ inline bool ArgContext::Bind(const UnboundConversion* unbound, if (static_cast<size_t>(arg_position - 1) >= pack_.size()) return false; arg = &pack_[arg_position - 1]; // 1-based - if (!unbound->flags.basic) { + if (unbound->flags != Flags::kBasic) { int width = unbound->width.value(); bool force_left = false; if (unbound->width.is_from_arg()) { @@ -84,9 +84,8 @@ inline bool ArgContext::Bind(const UnboundConversion* unbound, FormatConversionSpecImplFriend::SetPrecision(precision, bound); if (force_left) { - Flags flags = unbound->flags; - flags.left = true; - FormatConversionSpecImplFriend::SetFlags(flags, bound); + FormatConversionSpecImplFriend::SetFlags(unbound->flags | Flags::kLeft, + bound); } else { FormatConversionSpecImplFriend::SetFlags(unbound->flags, bound); } diff --git a/third_party/abseil-cpp/absl/strings/internal/str_format/convert_test.cc b/third_party/abseil-cpp/absl/strings/internal/str_format/convert_test.cc index 926283cfac..91e0360901 100644 --- a/third_party/abseil-cpp/absl/strings/internal/str_format/convert_test.cc +++ 
b/third_party/abseil-cpp/absl/strings/internal/str_format/convert_test.cc @@ -229,6 +229,9 @@ TEST_F(FormatConvertTest, BasicString) { TestStringConvert(static_cast<const char*>("hello")); TestStringConvert(std::string("hello")); TestStringConvert(string_view("hello")); +#if defined(ABSL_HAVE_STD_STRING_VIEW) + TestStringConvert(std::string_view("hello")); +#endif // ABSL_HAVE_STD_STRING_VIEW } TEST_F(FormatConvertTest, NullString) { diff --git a/third_party/abseil-cpp/absl/strings/internal/str_format/extension.cc b/third_party/abseil-cpp/absl/strings/internal/str_format/extension.cc index bb0d96cf32..484f6ebfc1 100644 --- a/third_party/abseil-cpp/absl/strings/internal/str_format/extension.cc +++ b/third_party/abseil-cpp/absl/strings/internal/str_format/extension.cc @@ -23,13 +23,13 @@ namespace absl { ABSL_NAMESPACE_BEGIN namespace str_format_internal { -std::string Flags::ToString() const { +std::string FlagsToString(Flags v) { std::string s; - s.append(left ? "-" : ""); - s.append(show_pos ? "+" : ""); - s.append(sign_col ? " " : ""); - s.append(alt ? "#" : ""); - s.append(zero ? "0" : ""); + s.append(FlagsContains(v, Flags::kLeft) ? "-" : ""); + s.append(FlagsContains(v, Flags::kShowPos) ? "+" : ""); + s.append(FlagsContains(v, Flags::kSignCol) ? " " : ""); + s.append(FlagsContains(v, Flags::kAlt) ? "#" : ""); + s.append(FlagsContains(v, Flags::kZero) ? 
"0" : ""); return s; } diff --git a/third_party/abseil-cpp/absl/strings/internal/str_format/extension.h b/third_party/abseil-cpp/absl/strings/internal/str_format/extension.h index a9b9e137de..55cbb56d0a 100644 --- a/third_party/abseil-cpp/absl/strings/internal/str_format/extension.h +++ b/third_party/abseil-cpp/absl/strings/internal/str_format/extension.h @@ -128,19 +128,33 @@ class FormatSinkImpl { char buf_[1024]; }; -struct Flags { - bool basic : 1; // fastest conversion: no flags, width, or precision - bool left : 1; // "-" - bool show_pos : 1; // "+" - bool sign_col : 1; // " " - bool alt : 1; // "#" - bool zero : 1; // "0" - std::string ToString() const; - friend std::ostream& operator<<(std::ostream& os, const Flags& v) { - return os << v.ToString(); - } +enum class Flags : uint8_t { + kBasic = 0, + kLeft = 1 << 0, + kShowPos = 1 << 1, + kSignCol = 1 << 2, + kAlt = 1 << 3, + kZero = 1 << 4, + // This is not a real flag. It just exists to turn off kBasic when no other + // flags are set. This is for when width/precision are specified. + kNonBasic = 1 << 5, }; +constexpr Flags operator|(Flags a, Flags b) { + return static_cast<Flags>(static_cast<uint8_t>(a) | static_cast<uint8_t>(b)); +} + +constexpr bool FlagsContains(Flags haystack, Flags needle) { + return (static_cast<uint8_t>(haystack) & static_cast<uint8_t>(needle)) == + static_cast<uint8_t>(needle); +} + +std::string FlagsToString(Flags v); + +inline std::ostream& operator<<(std::ostream& os, Flags v) { + return os << FlagsToString(v); +} + // clang-format off #define ABSL_INTERNAL_CONVERSION_CHARS_EXPAND_(X_VAL, X_SEP) \ /* text */ \ @@ -257,12 +271,16 @@ struct FormatConversionSpecImplFriend; class FormatConversionSpecImpl { public: // Width and precison are not specified, no flags are set. 
- bool is_basic() const { return flags_.basic; } - bool has_left_flag() const { return flags_.left; } - bool has_show_pos_flag() const { return flags_.show_pos; } - bool has_sign_col_flag() const { return flags_.sign_col; } - bool has_alt_flag() const { return flags_.alt; } - bool has_zero_flag() const { return flags_.zero; } + bool is_basic() const { return flags_ == Flags::kBasic; } + bool has_left_flag() const { return FlagsContains(flags_, Flags::kLeft); } + bool has_show_pos_flag() const { + return FlagsContains(flags_, Flags::kShowPos); + } + bool has_sign_col_flag() const { + return FlagsContains(flags_, Flags::kSignCol); + } + bool has_alt_flag() const { return FlagsContains(flags_, Flags::kAlt); } + bool has_zero_flag() const { return FlagsContains(flags_, Flags::kZero); } FormatConversionChar conversion_char() const { // Keep this field first in the struct . It generates better code when @@ -306,7 +324,7 @@ struct FormatConversionSpecImplFriend final { conv->precision_ = p; } static std::string FlagsToString(const FormatConversionSpecImpl& spec) { - return spec.flags_.ToString(); + return str_format_internal::FlagsToString(spec.flags_); } }; diff --git a/third_party/abseil-cpp/absl/strings/internal/str_format/parser.cc b/third_party/abseil-cpp/absl/strings/internal/str_format/parser.cc index f308d02351..2c9c07dacc 100644 --- a/third_party/abseil-cpp/absl/strings/internal/str_format/parser.cc +++ b/third_party/abseil-cpp/absl/strings/internal/str_format/parser.cc @@ -34,60 +34,67 @@ namespace str_format_internal { using CC = FormatConversionCharInternal; using LM = LengthMod; +// Abbreviations to fit in the table below. 
+constexpr auto f_sign = Flags::kSignCol; +constexpr auto f_alt = Flags::kAlt; +constexpr auto f_pos = Flags::kShowPos; +constexpr auto f_left = Flags::kLeft; +constexpr auto f_zero = Flags::kZero; + ABSL_CONST_INIT const ConvTag kTags[256] = { - {}, {}, {}, {}, {}, {}, {}, {}, // 00-07 - {}, {}, {}, {}, {}, {}, {}, {}, // 08-0f - {}, {}, {}, {}, {}, {}, {}, {}, // 10-17 - {}, {}, {}, {}, {}, {}, {}, {}, // 18-1f - {}, {}, {}, {}, {}, {}, {}, {}, // 20-27 - {}, {}, {}, {}, {}, {}, {}, {}, // 28-2f - {}, {}, {}, {}, {}, {}, {}, {}, // 30-37 - {}, {}, {}, {}, {}, {}, {}, {}, // 38-3f - {}, CC::A, {}, {}, {}, CC::E, CC::F, CC::G, // @ABCDEFG - {}, {}, {}, {}, LM::L, {}, {}, {}, // HIJKLMNO - {}, {}, {}, {}, {}, {}, {}, {}, // PQRSTUVW - CC::X, {}, {}, {}, {}, {}, {}, {}, // XYZ[\]^_ - {}, CC::a, {}, CC::c, CC::d, CC::e, CC::f, CC::g, // `abcdefg - LM::h, CC::i, LM::j, {}, LM::l, {}, CC::n, CC::o, // hijklmno - CC::p, LM::q, {}, CC::s, LM::t, CC::u, {}, {}, // pqrstuvw - CC::x, {}, LM::z, {}, {}, {}, {}, {}, // xyz{|}! 
- {}, {}, {}, {}, {}, {}, {}, {}, // 80-87 - {}, {}, {}, {}, {}, {}, {}, {}, // 88-8f - {}, {}, {}, {}, {}, {}, {}, {}, // 90-97 - {}, {}, {}, {}, {}, {}, {}, {}, // 98-9f - {}, {}, {}, {}, {}, {}, {}, {}, // a0-a7 - {}, {}, {}, {}, {}, {}, {}, {}, // a8-af - {}, {}, {}, {}, {}, {}, {}, {}, // b0-b7 - {}, {}, {}, {}, {}, {}, {}, {}, // b8-bf - {}, {}, {}, {}, {}, {}, {}, {}, // c0-c7 - {}, {}, {}, {}, {}, {}, {}, {}, // c8-cf - {}, {}, {}, {}, {}, {}, {}, {}, // d0-d7 - {}, {}, {}, {}, {}, {}, {}, {}, // d8-df - {}, {}, {}, {}, {}, {}, {}, {}, // e0-e7 - {}, {}, {}, {}, {}, {}, {}, {}, // e8-ef - {}, {}, {}, {}, {}, {}, {}, {}, // f0-f7 - {}, {}, {}, {}, {}, {}, {}, {}, // f8-ff + {}, {}, {}, {}, {}, {}, {}, {}, // 00-07 + {}, {}, {}, {}, {}, {}, {}, {}, // 08-0f + {}, {}, {}, {}, {}, {}, {}, {}, // 10-17 + {}, {}, {}, {}, {}, {}, {}, {}, // 18-1f + f_sign, {}, {}, f_alt, {}, {}, {}, {}, // !"#$%&' + {}, {}, {}, f_pos, {}, f_left, {}, {}, // ()*+,-./ + f_zero, {}, {}, {}, {}, {}, {}, {}, // 01234567 + {}, {}, {}, {}, {}, {}, {}, {}, // 89:;<=>? + {}, CC::A, {}, {}, {}, CC::E, CC::F, CC::G, // @ABCDEFG + {}, {}, {}, {}, LM::L, {}, {}, {}, // HIJKLMNO + {}, {}, {}, {}, {}, {}, {}, {}, // PQRSTUVW + CC::X, {}, {}, {}, {}, {}, {}, {}, // XYZ[\]^_ + {}, CC::a, {}, CC::c, CC::d, CC::e, CC::f, CC::g, // `abcdefg + LM::h, CC::i, LM::j, {}, LM::l, {}, CC::n, CC::o, // hijklmno + CC::p, LM::q, {}, CC::s, LM::t, CC::u, {}, {}, // pqrstuvw + CC::x, {}, LM::z, {}, {}, {}, {}, {}, // xyz{|}! 
+ {}, {}, {}, {}, {}, {}, {}, {}, // 80-87 + {}, {}, {}, {}, {}, {}, {}, {}, // 88-8f + {}, {}, {}, {}, {}, {}, {}, {}, // 90-97 + {}, {}, {}, {}, {}, {}, {}, {}, // 98-9f + {}, {}, {}, {}, {}, {}, {}, {}, // a0-a7 + {}, {}, {}, {}, {}, {}, {}, {}, // a8-af + {}, {}, {}, {}, {}, {}, {}, {}, // b0-b7 + {}, {}, {}, {}, {}, {}, {}, {}, // b8-bf + {}, {}, {}, {}, {}, {}, {}, {}, // c0-c7 + {}, {}, {}, {}, {}, {}, {}, {}, // c8-cf + {}, {}, {}, {}, {}, {}, {}, {}, // d0-d7 + {}, {}, {}, {}, {}, {}, {}, {}, // d8-df + {}, {}, {}, {}, {}, {}, {}, {}, // e0-e7 + {}, {}, {}, {}, {}, {}, {}, {}, // e8-ef + {}, {}, {}, {}, {}, {}, {}, {}, // f0-f7 + {}, {}, {}, {}, {}, {}, {}, {}, // f8-ff }; namespace { bool CheckFastPathSetting(const UnboundConversion& conv) { - bool should_be_basic = !conv.flags.left && // - !conv.flags.show_pos && // - !conv.flags.sign_col && // - !conv.flags.alt && // - !conv.flags.zero && // - (conv.width.value() == -1) && - (conv.precision.value() == -1); - if (should_be_basic != conv.flags.basic) { + bool width_precision_needed = + conv.width.value() >= 0 || conv.precision.value() >= 0; + if (width_precision_needed && conv.flags == Flags::kBasic) { fprintf(stderr, "basic=%d left=%d show_pos=%d sign_col=%d alt=%d zero=%d " "width=%d precision=%d\n", - conv.flags.basic, conv.flags.left, conv.flags.show_pos, - conv.flags.sign_col, conv.flags.alt, conv.flags.zero, - conv.width.value(), conv.precision.value()); + conv.flags == Flags::kBasic ? 1 : 0, + FlagsContains(conv.flags, Flags::kLeft) ? 1 : 0, + FlagsContains(conv.flags, Flags::kShowPos) ? 1 : 0, + FlagsContains(conv.flags, Flags::kSignCol) ? 1 : 0, + FlagsContains(conv.flags, Flags::kAlt) ? 1 : 0, + FlagsContains(conv.flags, Flags::kZero) ? 
1 : 0, conv.width.value(), + conv.precision.value()); + return false; } - return should_be_basic == conv.flags.basic; + return true; } template <bool is_positional> @@ -131,40 +138,21 @@ const char *ConsumeConversion(const char *pos, const char *const end, ABSL_FORMAT_PARSER_INTERNAL_GET_CHAR(); // We should start with the basic flag on. - assert(conv->flags.basic); + assert(conv->flags == Flags::kBasic); // Any non alpha character makes this conversion not basic. // This includes flags (-+ #0), width (1-9, *) or precision (.). // All conversion characters and length modifiers are alpha characters. if (c < 'A') { - conv->flags.basic = false; - - for (; c <= '0';) { - // FIXME: We might be able to speed this up reusing the lookup table from - // above. It might require changing Flags to be a plain integer where we - // can |= a value. - switch (c) { - case '-': - conv->flags.left = true; - break; - case '+': - conv->flags.show_pos = true; - break; - case ' ': - conv->flags.sign_col = true; - break; - case '#': - conv->flags.alt = true; - break; - case '0': - conv->flags.zero = true; - break; - default: - goto flags_done; + while (c <= '0') { + auto tag = GetTagForChar(c); + if (tag.is_flags()) { + conv->flags = conv->flags | tag.as_flags(); + ABSL_FORMAT_PARSER_INTERNAL_GET_CHAR(); + } else { + break; } - ABSL_FORMAT_PARSER_INTERNAL_GET_CHAR(); } -flags_done: if (c <= '9') { if (c >= '0') { @@ -173,12 +161,12 @@ flags_done: if (ABSL_PREDICT_FALSE(*next_arg != 0)) return nullptr; // Positional conversion. 
*next_arg = -1; - conv->flags = Flags(); - conv->flags.basic = true; return ConsumeConversion<true>(original_pos, end, conv, next_arg); } + conv->flags = conv->flags | Flags::kNonBasic; conv->width.set_value(maybe_width); } else if (c == '*') { + conv->flags = conv->flags | Flags::kNonBasic; ABSL_FORMAT_PARSER_INTERNAL_GET_CHAR(); if (is_positional) { if (ABSL_PREDICT_FALSE(c < '1' || c > '9')) return nullptr; @@ -192,6 +180,7 @@ flags_done: } if (c == '.') { + conv->flags = conv->flags | Flags::kNonBasic; ABSL_FORMAT_PARSER_INTERNAL_GET_CHAR(); if (std::isdigit(c)) { conv->precision.set_value(parse_digits()); diff --git a/third_party/abseil-cpp/absl/strings/internal/str_format/parser.h b/third_party/abseil-cpp/absl/strings/internal/str_format/parser.h index 6504dd3ddc..ad8646edff 100644 --- a/third_party/abseil-cpp/absl/strings/internal/str_format/parser.h +++ b/third_party/abseil-cpp/absl/strings/internal/str_format/parser.h @@ -41,10 +41,7 @@ std::string LengthModToString(LengthMod v); // The analyzed properties of a single specified conversion. struct UnboundConversion { - UnboundConversion() - : flags() /* This is required to zero all the fields of flags. */ { - flags.basic = true; - } + UnboundConversion() {} class InputValue { public: @@ -79,7 +76,7 @@ struct UnboundConversion { InputValue width; InputValue precision; - Flags flags; + Flags flags = Flags::kBasic; LengthMod length_mod = LengthMod::none; FormatConversionChar conv = FormatConversionCharInternal::kNone; }; @@ -93,32 +90,43 @@ const char* ConsumeUnboundConversion(const char* p, const char* end, UnboundConversion* conv, int* next_arg); // Helper tag class for the table below. -// It allows fast `char -> ConversionChar/LengthMod` checking and +// It allows fast `char -> ConversionChar/LengthMod/Flags` checking and // conversions. 
class ConvTag { public: constexpr ConvTag(FormatConversionChar conversion_char) // NOLINT - : tag_(static_cast<int8_t>(conversion_char)) {} - // We invert the length modifiers to make them negative so that we can easily - // test for them. + : tag_(static_cast<uint8_t>(conversion_char)) {} constexpr ConvTag(LengthMod length_mod) // NOLINT - : tag_(~static_cast<std::int8_t>(length_mod)) {} - // Everything else is -128, which is negative to make is_conv() simpler. - constexpr ConvTag() : tag_(-128) {} + : tag_(0x80 | static_cast<uint8_t>(length_mod)) {} + constexpr ConvTag(Flags flags) // NOLINT + : tag_(0xc0 | static_cast<uint8_t>(flags)) {} + constexpr ConvTag() : tag_(0xFF) {} + + bool is_conv() const { return (tag_ & 0x80) == 0; } + bool is_length() const { return (tag_ & 0xC0) == 0x80; } + bool is_flags() const { return (tag_ & 0xE0) == 0xC0; } - bool is_conv() const { return tag_ >= 0; } - bool is_length() const { return tag_ < 0 && tag_ != -128; } FormatConversionChar as_conv() const { assert(is_conv()); + assert(!is_length()); + assert(!is_flags()); return static_cast<FormatConversionChar>(tag_); } LengthMod as_length() const { + assert(!is_conv()); assert(is_length()); - return static_cast<LengthMod>(~tag_); + assert(!is_flags()); + return static_cast<LengthMod>(tag_ & 0x3F); + } + Flags as_flags() const { + assert(!is_conv()); + assert(!is_length()); + assert(is_flags()); + return static_cast<Flags>(tag_ & 0x1F); } private: - std::int8_t tag_; + uint8_t tag_; }; extern const ConvTag kTags[256]; diff --git a/third_party/abseil-cpp/absl/strings/internal/str_format/parser_test.cc b/third_party/abseil-cpp/absl/strings/internal/str_format/parser_test.cc index a5fa1c79aa..fe0d296360 100644 --- a/third_party/abseil-cpp/absl/strings/internal/str_format/parser_test.cc +++ b/third_party/abseil-cpp/absl/strings/internal/str_format/parser_test.cc @@ -270,15 +270,22 @@ TEST_F(ConsumeUnboundConversionTest, Flags) { for (int k = 0; k < kNumFlags; ++k) if ((i >> k) & 1) 
fmt += kAllFlags[k]; // flag order shouldn't matter - if (rev == 1) { std::reverse(fmt.begin(), fmt.end()); } + if (rev == 1) { + std::reverse(fmt.begin(), fmt.end()); + } fmt += 'd'; SCOPED_TRACE(fmt); EXPECT_TRUE(Run(fmt.c_str())); - EXPECT_EQ(fmt.find('-') == std::string::npos, !o.flags.left); - EXPECT_EQ(fmt.find('+') == std::string::npos, !o.flags.show_pos); - EXPECT_EQ(fmt.find(' ') == std::string::npos, !o.flags.sign_col); - EXPECT_EQ(fmt.find('#') == std::string::npos, !o.flags.alt); - EXPECT_EQ(fmt.find('0') == std::string::npos, !o.flags.zero); + EXPECT_EQ(fmt.find('-') == std::string::npos, + !FlagsContains(o.flags, Flags::kLeft)); + EXPECT_EQ(fmt.find('+') == std::string::npos, + !FlagsContains(o.flags, Flags::kShowPos)); + EXPECT_EQ(fmt.find(' ') == std::string::npos, + !FlagsContains(o.flags, Flags::kSignCol)); + EXPECT_EQ(fmt.find('#') == std::string::npos, + !FlagsContains(o.flags, Flags::kAlt)); + EXPECT_EQ(fmt.find('0') == std::string::npos, + !FlagsContains(o.flags, Flags::kZero)); } } } @@ -288,14 +295,14 @@ TEST_F(ConsumeUnboundConversionTest, BasicFlag) { for (const char* fmt : {"d", "llx", "G", "1$X"}) { SCOPED_TRACE(fmt); EXPECT_TRUE(Run(fmt)); - EXPECT_TRUE(o.flags.basic); + EXPECT_EQ(o.flags, Flags::kBasic); } // Flag is off for (const char* fmt : {"3d", ".llx", "-G", "1$#X"}) { SCOPED_TRACE(fmt); EXPECT_TRUE(Run(fmt)); - EXPECT_FALSE(o.flags.basic); + EXPECT_NE(o.flags, Flags::kBasic); } } diff --git a/third_party/abseil-cpp/absl/strings/internal/str_split_internal.h b/third_party/abseil-cpp/absl/strings/internal/str_split_internal.h index a2f41c1531..17c1bfe8d3 100644 --- a/third_party/abseil-cpp/absl/strings/internal/str_split_internal.h +++ b/third_party/abseil-cpp/absl/strings/internal/str_split_internal.h @@ -32,7 +32,7 @@ #include <array> #include <initializer_list> #include <iterator> -#include <map> +#include <tuple> #include <type_traits> #include <utility> #include <vector> @@ -182,6 +182,13 @@ template <typename T> struct 
HasConstIterator<T, absl::void_t<typename T::const_iterator>> : std::true_type {}; +// HasEmplace<T>::value is true iff there exists a method T::emplace(). +template <typename T, typename = void> +struct HasEmplace : std::false_type {}; +template <typename T> +struct HasEmplace<T, absl::void_t<decltype(std::declval<T>().emplace())>> + : std::true_type {}; + // IsInitializerList<T>::value is true iff T is an std::initializer_list. More // details below in Splitter<> where this is used. std::false_type IsInitializerListDispatch(...); // default: No @@ -372,50 +379,43 @@ class Splitter { // value. template <typename Container, typename First, typename Second> struct ConvertToContainer<Container, std::pair<const First, Second>, true> { + using iterator = typename Container::iterator; + Container operator()(const Splitter& splitter) const { Container m; - typename Container::iterator it; + iterator it; bool insert = true; - for (const auto& sp : splitter) { + for (const absl::string_view sv : splitter) { if (insert) { - it = Inserter<Container>::Insert(&m, First(sp), Second()); + it = InsertOrEmplace(&m, sv); } else { - it->second = Second(sp); + it->second = Second(sv); } insert = !insert; } return m; } - // Inserts the key and value into the given map, returning an iterator to - // the inserted item. Specialized for std::map and std::multimap to use - // emplace() and adapt emplace()'s return value. - template <typename Map> - struct Inserter { - using M = Map; - template <typename... Args> - static typename M::iterator Insert(M* m, Args&&... args) { - return m->insert(std::make_pair(std::forward<Args>(args)...)).first; - } - }; - - template <typename... Ts> - struct Inserter<std::map<Ts...>> { - using M = std::map<Ts...>; - template <typename... Args> - static typename M::iterator Insert(M* m, Args&&... args) { - return m->emplace(std::make_pair(std::forward<Args>(args)...)).first; - } - }; - - template <typename... 
Ts> - struct Inserter<std::multimap<Ts...>> { - using M = std::multimap<Ts...>; - template <typename... Args> - static typename M::iterator Insert(M* m, Args&&... args) { - return m->emplace(std::make_pair(std::forward<Args>(args)...)); - } - }; + // Inserts the key and an empty value into the map, returning an iterator to + // the inserted item. We use emplace() if available, otherwise insert(). + template <typename M> + static absl::enable_if_t<HasEmplace<M>::value, iterator> InsertOrEmplace( + M* m, absl::string_view key) { + // Use piecewise_construct to support old versions of gcc in which pair + // constructor can't otherwise construct string from string_view. + return ToIter(m->emplace(std::piecewise_construct, std::make_tuple(key), + std::tuple<>())); + } + template <typename M> + static absl::enable_if_t<!HasEmplace<M>::value, iterator> InsertOrEmplace( + M* m, absl::string_view key) { + return ToIter(m->insert(std::make_pair(First(key), Second("")))); + } + + static iterator ToIter(std::pair<iterator, bool> pair) { + return pair.first; + } + static iterator ToIter(iterator iter) { return iter; } }; StringType text_; diff --git a/third_party/abseil-cpp/absl/strings/numbers.h b/third_party/abseil-cpp/absl/strings/numbers.h index ffc738fa41..1780bb44bd 100644 --- a/third_party/abseil-cpp/absl/strings/numbers.h +++ b/third_party/abseil-cpp/absl/strings/numbers.h @@ -124,6 +124,7 @@ inline void PutTwoDigits(size_t i, char* buf) { } // safe_strto?() functions for implementing SimpleAtoi() + bool safe_strto32_base(absl::string_view text, int32_t* value, int base); bool safe_strto64_base(absl::string_view text, int64_t* value, int base); bool safe_strto128_base(absl::string_view text, absl::int128* value, diff --git a/third_party/abseil-cpp/absl/strings/str_cat.cc b/third_party/abseil-cpp/absl/strings/str_cat.cc index dd5d25b0d6..f4a77493a4 100644 --- a/third_party/abseil-cpp/absl/strings/str_cat.cc +++ b/third_party/abseil-cpp/absl/strings/str_cat.cc @@ -174,7 
+174,7 @@ void AppendPieces(std::string* dest, ASSERT_NO_OVERLAP(*dest, piece); total_size += piece.size(); } - strings_internal::STLStringResizeUninitialized(dest, total_size); + strings_internal::STLStringResizeUninitializedAmortized(dest, total_size); char* const begin = &(*dest)[0]; char* out = begin + old_size; @@ -199,7 +199,7 @@ void StrAppend(std::string* dest, const AlphaNum& a, const AlphaNum& b) { ASSERT_NO_OVERLAP(*dest, a); ASSERT_NO_OVERLAP(*dest, b); std::string::size_type old_size = dest->size(); - strings_internal::STLStringResizeUninitialized( + strings_internal::STLStringResizeUninitializedAmortized( dest, old_size + a.size() + b.size()); char* const begin = &(*dest)[0]; char* out = begin + old_size; @@ -214,7 +214,7 @@ void StrAppend(std::string* dest, const AlphaNum& a, const AlphaNum& b, ASSERT_NO_OVERLAP(*dest, b); ASSERT_NO_OVERLAP(*dest, c); std::string::size_type old_size = dest->size(); - strings_internal::STLStringResizeUninitialized( + strings_internal::STLStringResizeUninitializedAmortized( dest, old_size + a.size() + b.size() + c.size()); char* const begin = &(*dest)[0]; char* out = begin + old_size; @@ -231,7 +231,7 @@ void StrAppend(std::string* dest, const AlphaNum& a, const AlphaNum& b, ASSERT_NO_OVERLAP(*dest, c); ASSERT_NO_OVERLAP(*dest, d); std::string::size_type old_size = dest->size(); - strings_internal::STLStringResizeUninitialized( + strings_internal::STLStringResizeUninitializedAmortized( dest, old_size + a.size() + b.size() + c.size() + d.size()); char* const begin = &(*dest)[0]; char* out = begin + old_size; diff --git a/third_party/abseil-cpp/absl/strings/str_split_test.cc b/third_party/abseil-cpp/absl/strings/str_split_test.cc index 7f7c097fae..f472f9eda1 100644 --- a/third_party/abseil-cpp/absl/strings/str_split_test.cc +++ b/third_party/abseil-cpp/absl/strings/str_split_test.cc @@ -29,6 +29,8 @@ #include "gtest/gtest.h" #include "absl/base/dynamic_annotations.h" #include "absl/base/macros.h" +#include 
"absl/container/btree_map.h" +#include "absl/container/btree_set.h" #include "absl/container/flat_hash_map.h" #include "absl/container/node_hash_map.h" #include "absl/strings/numbers.h" @@ -405,6 +407,10 @@ TEST(Splitter, ConversionOperator) { TestConversionOperator<std::set<std::string>>(splitter); TestConversionOperator<std::multiset<absl::string_view>>(splitter); TestConversionOperator<std::multiset<std::string>>(splitter); + TestConversionOperator<absl::btree_set<absl::string_view>>(splitter); + TestConversionOperator<absl::btree_set<std::string>>(splitter); + TestConversionOperator<absl::btree_multiset<absl::string_view>>(splitter); + TestConversionOperator<absl::btree_multiset<std::string>>(splitter); TestConversionOperator<std::unordered_set<std::string>>(splitter); // Tests conversion to map-like objects. @@ -421,6 +427,22 @@ TEST(Splitter, ConversionOperator) { TestMapConversionOperator<std::multimap<std::string, absl::string_view>>( splitter); TestMapConversionOperator<std::multimap<std::string, std::string>>(splitter); + TestMapConversionOperator< + absl::btree_map<absl::string_view, absl::string_view>>(splitter); + TestMapConversionOperator<absl::btree_map<absl::string_view, std::string>>( + splitter); + TestMapConversionOperator<absl::btree_map<std::string, absl::string_view>>( + splitter); + TestMapConversionOperator<absl::btree_map<std::string, std::string>>( + splitter); + TestMapConversionOperator< + absl::btree_multimap<absl::string_view, absl::string_view>>(splitter); + TestMapConversionOperator< + absl::btree_multimap<absl::string_view, std::string>>(splitter); + TestMapConversionOperator< + absl::btree_multimap<std::string, absl::string_view>>(splitter); + TestMapConversionOperator<absl::btree_multimap<std::string, std::string>>( + splitter); TestMapConversionOperator<std::unordered_map<std::string, std::string>>( splitter); TestMapConversionOperator< diff --git a/third_party/abseil-cpp/absl/strings/string_view.cc 
b/third_party/abseil-cpp/absl/strings/string_view.cc index c5f5de936d..d596e08cde 100644 --- a/third_party/abseil-cpp/absl/strings/string_view.cc +++ b/third_party/abseil-cpp/absl/strings/string_view.cc @@ -78,8 +78,8 @@ std::ostream& operator<<(std::ostream& o, string_view piece) { return o; } -string_view::size_type string_view::find(string_view s, size_type pos) const - noexcept { +string_view::size_type string_view::find(string_view s, + size_type pos) const noexcept { if (empty() || pos > length_) { if (empty() && pos == 0 && s.empty()) return 0; return npos; @@ -98,8 +98,8 @@ string_view::size_type string_view::find(char c, size_type pos) const noexcept { return result != nullptr ? result - ptr_ : npos; } -string_view::size_type string_view::rfind(string_view s, size_type pos) const - noexcept { +string_view::size_type string_view::rfind(string_view s, + size_type pos) const noexcept { if (length_ < s.length_) return npos; if (s.empty()) return std::min(length_, pos); const char* last = ptr_ + std::min(length_ - s.length_, pos) + s.length_; @@ -108,8 +108,8 @@ string_view::size_type string_view::rfind(string_view s, size_type pos) const } // Search range is [0..pos] inclusive. If pos == npos, search everything. -string_view::size_type string_view::rfind(char c, size_type pos) const - noexcept { +string_view::size_type string_view::rfind(char c, + size_type pos) const noexcept { // Note: memrchr() is not available on Windows. 
if (empty()) return npos; for (size_type i = std::min(pos, length_ - 1);; --i) { @@ -121,9 +121,8 @@ string_view::size_type string_view::rfind(char c, size_type pos) const return npos; } -string_view::size_type string_view::find_first_of(string_view s, - size_type pos) const - noexcept { +string_view::size_type string_view::find_first_of( + string_view s, size_type pos) const noexcept { if (empty() || s.empty()) { return npos; } @@ -138,9 +137,8 @@ string_view::size_type string_view::find_first_of(string_view s, return npos; } -string_view::size_type string_view::find_first_not_of(string_view s, - size_type pos) const - noexcept { +string_view::size_type string_view::find_first_not_of( + string_view s, size_type pos) const noexcept { if (empty()) return npos; // Avoid the cost of LookupTable() for a single-character search. if (s.length_ == 1) return find_first_not_of(s.ptr_[0], pos); @@ -153,9 +151,8 @@ string_view::size_type string_view::find_first_not_of(string_view s, return npos; } -string_view::size_type string_view::find_first_not_of(char c, - size_type pos) const - noexcept { +string_view::size_type string_view::find_first_not_of( + char c, size_type pos) const noexcept { if (empty()) return npos; for (; pos < length_; ++pos) { if (ptr_[pos] != c) { @@ -180,9 +177,8 @@ string_view::size_type string_view::find_last_of(string_view s, return npos; } -string_view::size_type string_view::find_last_not_of(string_view s, - size_type pos) const - noexcept { +string_view::size_type string_view::find_last_not_of( + string_view s, size_type pos) const noexcept { if (empty()) return npos; size_type i = std::min(pos, length_ - 1); if (s.empty()) return i; @@ -198,9 +194,8 @@ string_view::size_type string_view::find_last_not_of(string_view s, return npos; } -string_view::size_type string_view::find_last_not_of(char c, - size_type pos) const - noexcept { +string_view::size_type string_view::find_last_not_of( + char c, size_type pos) const noexcept { if (empty()) return 
npos; size_type i = std::min(pos, length_ - 1); for (;; --i) { diff --git a/third_party/abseil-cpp/absl/strings/string_view.h b/third_party/abseil-cpp/absl/strings/string_view.h index 5260b5b73f..968549be46 100644 --- a/third_party/abseil-cpp/absl/strings/string_view.h +++ b/third_party/abseil-cpp/absl/strings/string_view.h @@ -36,6 +36,7 @@ #include <limits> #include <string> +#include "absl/base/attributes.h" #include "absl/base/config.h" #include "absl/base/internal/throw_delegate.h" #include "absl/base/macros.h" @@ -61,6 +62,12 @@ ABSL_NAMESPACE_END #define ABSL_INTERNAL_STRING_VIEW_MEMCMP memcmp #endif // ABSL_HAVE_BUILTIN(__builtin_memcmp) +#if defined(__cplusplus) && __cplusplus >= 201402L +#define ABSL_INTERNAL_STRING_VIEW_CXX14_CONSTEXPR constexpr +#else +#define ABSL_INTERNAL_STRING_VIEW_CXX14_CONSTEXPR +#endif + namespace absl { ABSL_NAMESPACE_BEGIN @@ -180,8 +187,8 @@ class string_view { template <typename Allocator> string_view( // NOLINT(runtime/explicit) - const std::basic_string<char, std::char_traits<char>, Allocator>& - str) noexcept + const std::basic_string<char, std::char_traits<char>, Allocator>& str + ABSL_ATTRIBUTE_LIFETIME_BOUND) noexcept // This is implemented in terms of `string_view(p, n)` so `str.size()` // doesn't need to be reevaluated after `ptr_` is set. : string_view(str.data(), str.size()) {} @@ -264,9 +271,7 @@ class string_view { // string_view::size() // // Returns the number of characters in the `string_view`. - constexpr size_type size() const noexcept { - return length_; - } + constexpr size_type size() const noexcept { return length_; } // string_view::length() // @@ -333,7 +338,7 @@ class string_view { // // Removes the first `n` characters from the `string_view`. Note that the // underlying string is not changed, only the view. 
- void remove_prefix(size_type n) { + ABSL_INTERNAL_STRING_VIEW_CXX14_CONSTEXPR void remove_prefix(size_type n) { ABSL_HARDENING_ASSERT(n <= length_); ptr_ += n; length_ -= n; @@ -343,7 +348,7 @@ class string_view { // // Removes the last `n` characters from the `string_view`. Note that the // underlying string is not changed, only the view. - void remove_suffix(size_type n) { + ABSL_INTERNAL_STRING_VIEW_CXX14_CONSTEXPR void remove_suffix(size_type n) { ABSL_HARDENING_ASSERT(n <= length_); length_ -= n; } @@ -351,7 +356,7 @@ class string_view { // string_view::swap() // // Swaps this `string_view` with another `string_view`. - void swap(string_view& s) noexcept { + ABSL_INTERNAL_STRING_VIEW_CXX14_CONSTEXPR void swap(string_view& s) noexcept { auto t = *this; *this = s; s = t; @@ -388,7 +393,7 @@ class string_view { // `n`) as another string_view. This function throws `std::out_of_bounds` if // `pos > size`. // Use absl::ClippedSubstr if you need a truncating substr operation. - constexpr string_view substr(size_type pos, size_type n = npos) const { + constexpr string_view substr(size_type pos = 0, size_type n = npos) const { return ABSL_PREDICT_FALSE(pos > length_) ? (base_internal::ThrowStdOutOfRange( "absl::string_view::substr"), @@ -398,12 +403,10 @@ class string_view { // string_view::compare() // - // Performs a lexicographical comparison between the `string_view` and - // another `absl::string_view`, returning -1 if `this` is less than, 0 if - // `this` is equal to, and 1 if `this` is greater than the passed string - // view. Note that in the case of data equality, a further comparison is made - // on the respective sizes of the two `string_view`s to determine which is - // smaller, equal, or greater. + // Performs a lexicographical comparison between this `string_view` and + // another `string_view` `x`, returning a negative value if `*this` is less + // than `x`, 0 if `*this` is equal to `x`, and a positive value if `*this` + // is greater than `x`. 
constexpr int compare(string_view x) const noexcept { return CompareImpl(length_, x.length_, Min(length_, x.length_) == 0 @@ -414,31 +417,31 @@ class string_view { // Overload of `string_view::compare()` for comparing a substring of the // 'string_view` and another `absl::string_view`. - int compare(size_type pos1, size_type count1, string_view v) const { + constexpr int compare(size_type pos1, size_type count1, string_view v) const { return substr(pos1, count1).compare(v); } // Overload of `string_view::compare()` for comparing a substring of the // `string_view` and a substring of another `absl::string_view`. - int compare(size_type pos1, size_type count1, string_view v, size_type pos2, - size_type count2) const { + constexpr int compare(size_type pos1, size_type count1, string_view v, + size_type pos2, size_type count2) const { return substr(pos1, count1).compare(v.substr(pos2, count2)); } // Overload of `string_view::compare()` for comparing a `string_view` and a - // a different C-style string `s`. - int compare(const char* s) const { return compare(string_view(s)); } + // a different C-style string `s`. + constexpr int compare(const char* s) const { return compare(string_view(s)); } // Overload of `string_view::compare()` for comparing a substring of the // `string_view` and a different string C-style string `s`. - int compare(size_type pos1, size_type count1, const char* s) const { + constexpr int compare(size_type pos1, size_type count1, const char* s) const { return substr(pos1, count1).compare(string_view(s)); } // Overload of `string_view::compare()` for comparing a substring of the // `string_view` and a substring of a different C-style string `s`. 
- int compare(size_type pos1, size_type count1, const char* s, - size_type count2) const { + constexpr int compare(size_type pos1, size_type count1, const char* s, + size_type count2) const { return substr(pos1, count1).compare(string_view(s, count2)); } @@ -455,48 +458,92 @@ class string_view { // within the `string_view`. size_type find(char c, size_type pos = 0) const noexcept; + // Overload of `string_view::find()` for finding a substring of a different + // C-style string `s` within the `string_view`. + size_type find(const char* s, size_type pos, size_type count) const { + return find(string_view(s, count), pos); + } + + // Overload of `string_view::find()` for finding a different C-style string + // `s` within the `string_view`. + size_type find(const char* s, size_type pos = 0) const { + return find(string_view(s), pos); + } + // string_view::rfind() // // Finds the last occurrence of a substring `s` within the `string_view`, // returning the position of the first character's match, or `npos` if no // match was found. - size_type rfind(string_view s, size_type pos = npos) const - noexcept; + size_type rfind(string_view s, size_type pos = npos) const noexcept; // Overload of `string_view::rfind()` for finding the last given character `c` // within the `string_view`. size_type rfind(char c, size_type pos = npos) const noexcept; + // Overload of `string_view::rfind()` for finding a substring of a different + // C-style string `s` within the `string_view`. + size_type rfind(const char* s, size_type pos, size_type count) const { + return rfind(string_view(s, count), pos); + } + + // Overload of `string_view::rfind()` for finding a different C-style string + // `s` within the `string_view`. 
+ size_type rfind(const char* s, size_type pos = npos) const { + return rfind(string_view(s), pos); + } + // string_view::find_first_of() // // Finds the first occurrence of any of the characters in `s` within the // `string_view`, returning the start position of the match, or `npos` if no // match was found. - size_type find_first_of(string_view s, size_type pos = 0) const - noexcept; + size_type find_first_of(string_view s, size_type pos = 0) const noexcept; // Overload of `string_view::find_first_of()` for finding a character `c` // within the `string_view`. - size_type find_first_of(char c, size_type pos = 0) const - noexcept { + size_type find_first_of(char c, size_type pos = 0) const noexcept { return find(c, pos); } + // Overload of `string_view::find_first_of()` for finding a substring of a + // different C-style string `s` within the `string_view`. + size_type find_first_of(const char* s, size_type pos, + size_type count) const { + return find_first_of(string_view(s, count), pos); + } + + // Overload of `string_view::find_first_of()` for finding a different C-style + // string `s` within the `string_view`. + size_type find_first_of(const char* s, size_type pos = 0) const { + return find_first_of(string_view(s), pos); + } + // string_view::find_last_of() // // Finds the last occurrence of any of the characters in `s` within the // `string_view`, returning the start position of the match, or `npos` if no // match was found. - size_type find_last_of(string_view s, size_type pos = npos) const - noexcept; + size_type find_last_of(string_view s, size_type pos = npos) const noexcept; // Overload of `string_view::find_last_of()` for finding a character `c` // within the `string_view`. 
- size_type find_last_of(char c, size_type pos = npos) const - noexcept { + size_type find_last_of(char c, size_type pos = npos) const noexcept { return rfind(c, pos); } + // Overload of `string_view::find_last_of()` for finding a substring of a + // different C-style string `s` within the `string_view`. + size_type find_last_of(const char* s, size_type pos, size_type count) const { + return find_last_of(string_view(s, count), pos); + } + + // Overload of `string_view::find_last_of()` for finding a different C-style + // string `s` within the `string_view`. + size_type find_last_of(const char* s, size_type pos = npos) const { + return find_last_of(string_view(s), pos); + } + // string_view::find_first_not_of() // // Finds the first occurrence of any of the characters not in `s` within the @@ -508,18 +555,43 @@ class string_view { // that is not `c` within the `string_view`. size_type find_first_not_of(char c, size_type pos = 0) const noexcept; + // Overload of `string_view::find_first_not_of()` for finding a substring of a + // different C-style string `s` within the `string_view`. + size_type find_first_not_of(const char* s, size_type pos, + size_type count) const { + return find_first_not_of(string_view(s, count), pos); + } + + // Overload of `string_view::find_first_not_of()` for finding a different + // C-style string `s` within the `string_view`. + size_type find_first_not_of(const char* s, size_type pos = 0) const { + return find_first_not_of(string_view(s), pos); + } + // string_view::find_last_not_of() // // Finds the last occurrence of any of the characters not in `s` within the // `string_view`, returning the start position of the last non-match, or // `npos` if no non-match was found. size_type find_last_not_of(string_view s, - size_type pos = npos) const noexcept; + size_type pos = npos) const noexcept; // Overload of `string_view::find_last_not_of()` for finding a character // that is not `c` within the `string_view`. 
- size_type find_last_not_of(char c, size_type pos = npos) const - noexcept; + size_type find_last_not_of(char c, size_type pos = npos) const noexcept; + + // Overload of `string_view::find_last_not_of()` for finding a substring of a + // different C-style string `s` within the `string_view`. + size_type find_last_not_of(const char* s, size_type pos, + size_type count) const { + return find_last_not_of(string_view(s, count), pos); + } + + // Overload of `string_view::find_last_not_of()` for finding a different + // C-style string `s` within the `string_view`. + size_type find_last_not_of(const char* s, size_type pos = npos) const { + return find_last_not_of(string_view(s), pos); + } private: static constexpr size_type kMaxSize = @@ -597,6 +669,7 @@ std::ostream& operator<<(std::ostream& o, string_view piece); ABSL_NAMESPACE_END } // namespace absl +#undef ABSL_INTERNAL_STRING_VIEW_CXX14_CONSTEXPR #undef ABSL_INTERNAL_STRING_VIEW_MEMCMP #endif // ABSL_USES_STD_STRING_VIEW diff --git a/third_party/abseil-cpp/absl/strings/string_view_test.cc b/third_party/abseil-cpp/absl/strings/string_view_test.cc index 643af8f81b..2c13dd1c14 100644 --- a/third_party/abseil-cpp/absl/strings/string_view_test.cc +++ b/third_party/abseil-cpp/absl/strings/string_view_test.cc @@ -449,6 +449,24 @@ TEST(StringViewTest, STL2) { EXPECT_EQ(d.find('x', 4), absl::string_view::npos); EXPECT_EQ(e.find('x', 7), absl::string_view::npos); + EXPECT_EQ(a.find(b.data(), 1, 0), 1); + EXPECT_EQ(a.find(c.data(), 9, 0), 9); + EXPECT_EQ(a.find(c.data(), absl::string_view::npos, 0), + absl::string_view::npos); + EXPECT_EQ(b.find(c.data(), absl::string_view::npos, 0), + absl::string_view::npos); + // empty string nonsense + EXPECT_EQ(d.find(b.data(), 4, 0), absl::string_view::npos); + EXPECT_EQ(e.find(b.data(), 7, 0), absl::string_view::npos); + + EXPECT_EQ(a.find(b.data(), 1), absl::string_view::npos); + EXPECT_EQ(a.find(c.data(), 9), 23); + EXPECT_EQ(a.find(c.data(), absl::string_view::npos), 
absl::string_view::npos); + EXPECT_EQ(b.find(c.data(), absl::string_view::npos), absl::string_view::npos); + // empty string nonsense + EXPECT_EQ(d.find(b.data(), 4), absl::string_view::npos); + EXPECT_EQ(e.find(b.data(), 7), absl::string_view::npos); + EXPECT_EQ(a.rfind(b), 0); EXPECT_EQ(a.rfind(b, 1), 0); EXPECT_EQ(a.rfind(c), 23); @@ -490,6 +508,14 @@ TEST(StringViewTest, STL2) { EXPECT_EQ(e.rfind('o'), absl::string_view::npos); EXPECT_EQ(d.rfind('o', 4), absl::string_view::npos); EXPECT_EQ(e.rfind('o', 7), absl::string_view::npos); + + EXPECT_EQ(a.rfind(b.data(), 1, 0), 1); + EXPECT_EQ(a.rfind(c.data(), 22, 0), 22); + EXPECT_EQ(a.rfind(c.data(), 1, 0), 1); + EXPECT_EQ(a.rfind(c.data(), 0, 0), 0); + EXPECT_EQ(b.rfind(c.data(), 0, 0), 0); + EXPECT_EQ(d.rfind(b.data(), 4, 0), 0); + EXPECT_EQ(e.rfind(b.data(), 7, 0), 0); } // Continued from STL2 @@ -678,6 +704,7 @@ TEST(StringViewTest, STL2Substr) { EXPECT_EQ(a.substr(23, 3), c); EXPECT_EQ(a.substr(23, 99), c); EXPECT_EQ(a.substr(0), a); + EXPECT_EQ(a.substr(), a); EXPECT_EQ(a.substr(3, 2), "de"); // empty string nonsense EXPECT_EQ(d.substr(0, 99), e); @@ -1087,7 +1114,24 @@ TEST(StringViewTest, ConstexprCompiles) { EXPECT_EQ(sp_npos, -1); } -TEST(StringViewTest, ConstexprSubstr) { +constexpr char ConstexprMethodsHelper() { +#if defined(__cplusplus) && __cplusplus >= 201402L + absl::string_view str("123", 3); + str.remove_prefix(1); + str.remove_suffix(1); + absl::string_view bar; + str.swap(bar); + return bar.front(); +#else + return '2'; +#endif +} + +TEST(StringViewTest, ConstexprMethods) { + // remove_prefix, remove_suffix, swap + static_assert(ConstexprMethodsHelper() == '2', ""); + + // substr constexpr absl::string_view foobar("foobar", 6); constexpr absl::string_view foo = foobar.substr(0, 3); constexpr absl::string_view bar = foobar.substr(3); diff --git a/third_party/abseil-cpp/absl/synchronization/CMakeLists.txt b/third_party/abseil-cpp/absl/synchronization/CMakeLists.txt index e633d0bf53..605efe2d02 
100644 --- a/third_party/abseil-cpp/absl/synchronization/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/synchronization/CMakeLists.txt @@ -95,7 +95,7 @@ absl_cc_test( DEPS absl::synchronization absl::time - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -108,7 +108,7 @@ absl_cc_test( DEPS absl::synchronization absl::time - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -122,7 +122,7 @@ absl_cc_test( absl::graphcycles_internal absl::core_headers absl::raw_logging_internal - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -154,7 +154,7 @@ absl_cc_test( absl::memory absl::raw_logging_internal absl::time - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -167,7 +167,7 @@ absl_cc_test( DEPS absl::synchronization absl::time - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -183,7 +183,7 @@ absl_cc_library( absl::config absl::strings absl::time - gmock + GTest::gmock TESTONLY ) @@ -199,7 +199,7 @@ absl_cc_test( absl::synchronization absl::strings absl::time - gmock_main + GTest::gmock_main ) absl_cc_test( diff --git a/third_party/abseil-cpp/absl/synchronization/blocking_counter.cc b/third_party/abseil-cpp/absl/synchronization/blocking_counter.cc index 3cea7aed24..d2f82da3bb 100644 --- a/third_party/abseil-cpp/absl/synchronization/blocking_counter.cc +++ b/third_party/abseil-cpp/absl/synchronization/blocking_counter.cc @@ -14,41 +14,51 @@ #include "absl/synchronization/blocking_counter.h" +#include <atomic> + #include "absl/base/internal/raw_logging.h" namespace absl { ABSL_NAMESPACE_BEGIN -// Return whether int *arg is zero. -static bool IsZero(void *arg) { - return 0 == *reinterpret_cast<int *>(arg); +namespace { + +// Return whether int *arg is true. +bool IsDone(void *arg) { return *reinterpret_cast<bool *>(arg); } + +} // namespace + +BlockingCounter::BlockingCounter(int initial_count) + : count_(initial_count), + num_waiting_(0), + done_{initial_count == 0 ? 
true : false} { + ABSL_RAW_CHECK(initial_count >= 0, "BlockingCounter initial_count negative"); } bool BlockingCounter::DecrementCount() { - MutexLock l(&lock_); - count_--; - if (count_ < 0) { - ABSL_RAW_LOG( - FATAL, - "BlockingCounter::DecrementCount() called too many times. count=%d", - count_); + int count = count_.fetch_sub(1, std::memory_order_acq_rel) - 1; + ABSL_RAW_CHECK(count >= 0, + "BlockingCounter::DecrementCount() called too many times"); + if (count == 0) { + MutexLock l(&lock_); + done_ = true; + return true; } - return count_ == 0; + return false; } void BlockingCounter::Wait() { MutexLock l(&this->lock_); - ABSL_RAW_CHECK(count_ >= 0, "BlockingCounter underflow"); // only one thread may call Wait(). To support more than one thread, // implement a counter num_to_exit, like in the Barrier class. ABSL_RAW_CHECK(num_waiting_ == 0, "multiple threads called Wait()"); num_waiting_++; - this->lock_.Await(Condition(IsZero, &this->count_)); + this->lock_.Await(Condition(IsDone, &this->done_)); - // At this point, We know that all threads executing DecrementCount have - // released the lock, and so will not touch this object again. + // At this point, we know that all threads executing DecrementCount + // will not touch this object again. // Therefore, the thread calling this method is free to delete the object // after we return from this method. 
} diff --git a/third_party/abseil-cpp/absl/synchronization/blocking_counter.h b/third_party/abseil-cpp/absl/synchronization/blocking_counter.h index 1f53f9f240..1908fdb1d9 100644 --- a/third_party/abseil-cpp/absl/synchronization/blocking_counter.h +++ b/third_party/abseil-cpp/absl/synchronization/blocking_counter.h @@ -20,6 +20,8 @@ #ifndef ABSL_SYNCHRONIZATION_BLOCKING_COUNTER_H_ #define ABSL_SYNCHRONIZATION_BLOCKING_COUNTER_H_ +#include <atomic> + #include "absl/base/thread_annotations.h" #include "absl/synchronization/mutex.h" @@ -60,8 +62,7 @@ ABSL_NAMESPACE_BEGIN // class BlockingCounter { public: - explicit BlockingCounter(int initial_count) - : count_(initial_count), num_waiting_(0) {} + explicit BlockingCounter(int initial_count); BlockingCounter(const BlockingCounter&) = delete; BlockingCounter& operator=(const BlockingCounter&) = delete; @@ -89,8 +90,9 @@ class BlockingCounter { private: Mutex lock_; - int count_ ABSL_GUARDED_BY(lock_); + std::atomic<int> count_; int num_waiting_ ABSL_GUARDED_BY(lock_); + bool done_ ABSL_GUARDED_BY(lock_); }; ABSL_NAMESPACE_END diff --git a/third_party/abseil-cpp/absl/synchronization/blocking_counter_benchmark.cc b/third_party/abseil-cpp/absl/synchronization/blocking_counter_benchmark.cc new file mode 100644 index 0000000000..b504d1a57c --- /dev/null +++ b/third_party/abseil-cpp/absl/synchronization/blocking_counter_benchmark.cc @@ -0,0 +1,83 @@ +// Copyright 2021 The Abseil Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include <limits> + +#include "absl/synchronization/blocking_counter.h" +#include "absl/synchronization/internal/thread_pool.h" +#include "benchmark/benchmark.h" + +namespace { + +void BM_BlockingCounter_SingleThread(benchmark::State& state) { + for (auto _ : state) { + int iterations = state.range(0); + absl::BlockingCounter counter{iterations}; + for (int i = 0; i < iterations; ++i) { + counter.DecrementCount(); + } + counter.Wait(); + } +} +BENCHMARK(BM_BlockingCounter_SingleThread) + ->ArgName("iterations") + ->Arg(2) + ->Arg(4) + ->Arg(16) + ->Arg(64) + ->Arg(256); + +void BM_BlockingCounter_DecrementCount(benchmark::State& state) { + static absl::BlockingCounter* counter = + new absl::BlockingCounter{std::numeric_limits<int>::max()}; + for (auto _ : state) { + counter->DecrementCount(); + } +} +BENCHMARK(BM_BlockingCounter_DecrementCount) + ->Threads(2) + ->Threads(4) + ->Threads(6) + ->Threads(8) + ->Threads(10) + ->Threads(12) + ->Threads(16) + ->Threads(32) + ->Threads(64) + ->Threads(128); + +void BM_BlockingCounter_Wait(benchmark::State& state) { + int num_threads = state.range(0); + absl::synchronization_internal::ThreadPool pool(num_threads); + for (auto _ : state) { + absl::BlockingCounter counter{num_threads}; + pool.Schedule([num_threads, &counter, &pool]() { + for (int i = 0; i < num_threads; ++i) { + pool.Schedule([&counter]() { counter.DecrementCount(); }); + } + }); + counter.Wait(); + } +} +BENCHMARK(BM_BlockingCounter_Wait) + ->ArgName("threads") + ->Arg(2) + ->Arg(4) + ->Arg(8) + ->Arg(16) + ->Arg(32) + ->Arg(64) + ->Arg(128); + +} // namespace diff --git a/third_party/abseil-cpp/absl/synchronization/blocking_counter_test.cc b/third_party/abseil-cpp/absl/synchronization/blocking_counter_test.cc index 2926224af7..06885f5759 100644 --- a/third_party/abseil-cpp/absl/synchronization/blocking_counter_test.cc +++ 
b/third_party/abseil-cpp/absl/synchronization/blocking_counter_test.cc @@ -63,6 +63,18 @@ TEST(BlockingCounterTest, BasicFunctionality) { } } +TEST(BlockingCounterTest, WaitZeroInitialCount) { + BlockingCounter counter(0); + counter.Wait(); +} + +#if GTEST_HAS_DEATH_TEST +TEST(BlockingCounterTest, WaitNegativeInitialCount) { + EXPECT_DEATH(BlockingCounter counter(-1), + "BlockingCounter initial_count negative"); +} +#endif + } // namespace ABSL_NAMESPACE_END } // namespace absl diff --git a/third_party/abseil-cpp/absl/synchronization/internal/per_thread_sem_test.cc b/third_party/abseil-cpp/absl/synchronization/internal/per_thread_sem_test.cc index 8cf59e64e9..db1184e679 100644 --- a/third_party/abseil-cpp/absl/synchronization/internal/per_thread_sem_test.cc +++ b/third_party/abseil-cpp/absl/synchronization/internal/per_thread_sem_test.cc @@ -159,7 +159,7 @@ TEST_F(PerThreadSemTest, Timeouts) { const absl::Duration elapsed = absl::Now() - start; // Allow for a slight early return, to account for quality of implementation // issues on various platforms. - const absl::Duration slop = absl::Microseconds(200); + const absl::Duration slop = absl::Milliseconds(1); EXPECT_LE(delay - slop, elapsed) << "Wait returned " << delay - elapsed << " early (with " << slop << " slop), start time was " << start; diff --git a/third_party/abseil-cpp/absl/synchronization/internal/waiter.cc b/third_party/abseil-cpp/absl/synchronization/internal/waiter.cc index 2123be60f5..28ef311e4a 100644 --- a/third_party/abseil-cpp/absl/synchronization/internal/waiter.cc +++ b/third_party/abseil-cpp/absl/synchronization/internal/waiter.cc @@ -79,6 +79,7 @@ bool Waiter::Wait(KernelTimeout t) { // Note that, since the thread ticker is just reset, we don't need to check // whether the thread is idle on the very first pass of the loop. 
bool first_pass = true; + while (true) { int32_t x = futex_.load(std::memory_order_relaxed); while (x != 0) { @@ -90,7 +91,6 @@ bool Waiter::Wait(KernelTimeout t) { return true; // Consumed a wakeup, we are done. } - if (!first_pass) MaybeBecomeIdle(); const int err = Futex::WaitUntil(&futex_, 0, t); if (err != 0) { diff --git a/third_party/abseil-cpp/absl/synchronization/mutex_test.cc b/third_party/abseil-cpp/absl/synchronization/mutex_test.cc index 058f757b48..f8fbf9488c 100644 --- a/third_party/abseil-cpp/absl/synchronization/mutex_test.cc +++ b/third_party/abseil-cpp/absl/synchronization/mutex_test.cc @@ -852,7 +852,7 @@ TEST(Mutex, MutexReaderDecrementBug) ABSL_NO_THREAD_SAFETY_ANALYSIS { // held and then destroyed (w/o unlocking). #ifdef ABSL_HAVE_THREAD_SANITIZER // TSAN reports errors when locked Mutexes are destroyed. -TEST(Mutex, DISABLED_LockedMutexDestructionBug) NO_THREAD_SAFETY_ANALYSIS { +TEST(Mutex, DISABLED_LockedMutexDestructionBug) ABSL_NO_THREAD_SAFETY_ANALYSIS { #else TEST(Mutex, LockedMutexDestructionBug) ABSL_NO_THREAD_SAFETY_ANALYSIS { #endif @@ -1153,7 +1153,7 @@ TEST(Mutex, DeadlockDetectorStressTest) ABSL_NO_THREAD_SAFETY_ANALYSIS { #ifdef ABSL_HAVE_THREAD_SANITIZER // TSAN reports errors when locked Mutexes are destroyed. 
-TEST(Mutex, DISABLED_DeadlockIdBug) NO_THREAD_SAFETY_ANALYSIS { +TEST(Mutex, DISABLED_DeadlockIdBug) ABSL_NO_THREAD_SAFETY_ANALYSIS { #else TEST(Mutex, DeadlockIdBug) ABSL_NO_THREAD_SAFETY_ANALYSIS { #endif diff --git a/third_party/abseil-cpp/absl/time/CMakeLists.txt b/third_party/abseil-cpp/absl/time/CMakeLists.txt index 00bdd499c1..f6ff8bd127 100644 --- a/third_party/abseil-cpp/absl/time/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/time/CMakeLists.txt @@ -102,7 +102,7 @@ absl_cc_library( absl::config absl::raw_logging_internal absl::time_zone - gmock + GTest::gmock TESTONLY ) @@ -124,5 +124,5 @@ absl_cc_test( absl::config absl::core_headers absl::time_zone - gmock_main + GTest::gmock_main ) diff --git a/third_party/abseil-cpp/absl/time/civil_time.cc b/third_party/abseil-cpp/absl/time/civil_time.cc index bdfe9ce0ef..6a231edb2d 100644 --- a/third_party/abseil-cpp/absl/time/civil_time.cc +++ b/third_party/abseil-cpp/absl/time/civil_time.cc @@ -38,9 +38,7 @@ std::string FormatYearAnd(string_view fmt, CivilSecond cs) { const CivilSecond ncs(NormalizeYear(cs.year()), cs.month(), cs.day(), cs.hour(), cs.minute(), cs.second()); const TimeZone utc = UTCTimeZone(); - // TODO(absl-team): Avoid conversion of fmt string. - return StrCat(cs.year(), - FormatTime(std::string(fmt), FromCivil(ncs, utc), utc)); + return StrCat(cs.year(), FormatTime(fmt, FromCivil(ncs, utc), utc)); } template <typename CivilT> diff --git a/third_party/abseil-cpp/absl/time/duration_test.cc b/third_party/abseil-cpp/absl/time/duration_test.cc index fb28fa987f..b7209e1c0a 100644 --- a/third_party/abseil-cpp/absl/time/duration_test.cc +++ b/third_party/abseil-cpp/absl/time/duration_test.cc @@ -17,6 +17,7 @@ #endif #include <chrono> // NOLINT(build/c++11) +#include <cfloat> #include <cmath> #include <cstdint> #include <ctime> @@ -1320,7 +1321,7 @@ TEST(Duration, SmallConversions) { EXPECT_EQ(absl::ZeroDuration(), absl::Seconds(0)); // TODO(bww): Is the next one OK? 
- EXPECT_EQ(absl::ZeroDuration(), absl::Seconds(0.124999999e-9)); + EXPECT_EQ(absl::ZeroDuration(), absl::Seconds(std::nextafter(0.125e-9, 0))); EXPECT_EQ(absl::Nanoseconds(1) / 4, absl::Seconds(0.125e-9)); EXPECT_EQ(absl::Nanoseconds(1) / 4, absl::Seconds(0.250e-9)); EXPECT_EQ(absl::Nanoseconds(1) / 2, absl::Seconds(0.375e-9)); @@ -1330,7 +1331,7 @@ TEST(Duration, SmallConversions) { EXPECT_EQ(absl::Nanoseconds(1), absl::Seconds(0.875e-9)); EXPECT_EQ(absl::Nanoseconds(1), absl::Seconds(1.000e-9)); - EXPECT_EQ(absl::ZeroDuration(), absl::Seconds(-0.124999999e-9)); + EXPECT_EQ(absl::ZeroDuration(), absl::Seconds(std::nextafter(-0.125e-9, 0))); EXPECT_EQ(-absl::Nanoseconds(1) / 4, absl::Seconds(-0.125e-9)); EXPECT_EQ(-absl::Nanoseconds(1) / 4, absl::Seconds(-0.250e-9)); EXPECT_EQ(-absl::Nanoseconds(1) / 2, absl::Seconds(-0.375e-9)); @@ -1390,6 +1391,14 @@ void VerifyApproxSameAsMul(double time_as_seconds, int* const misses) { // Seconds(point) returns a duration near point * Seconds(1.0). (They may // not be exactly equal due to fused multiply/add contraction.) TEST(Duration, ToDoubleSecondsCheckEdgeCases) { +#if (defined(__i386__) || defined(_M_IX86)) && FLT_EVAL_METHOD != 0 + // We're using an x87-compatible FPU, and intermediate operations can be + // performed with 80-bit floats. This means the edge cases are different than + // what we expect here, so just skip this test. 
+ GTEST_SKIP() + << "Skipping the test because we detected x87 floating-point semantics"; +#endif + constexpr uint32_t kTicksPerSecond = absl::time_internal::kTicksPerSecond; constexpr auto duration_tick = absl::time_internal::MakeDuration(0, 1u); int misses = 0; diff --git a/third_party/abseil-cpp/absl/time/internal/cctz/src/time_zone_fixed.cc b/third_party/abseil-cpp/absl/time/internal/cctz/src/time_zone_fixed.cc index 303c0244a8..f2b3294ef7 100644 --- a/third_party/abseil-cpp/absl/time/internal/cctz/src/time_zone_fixed.cc +++ b/third_party/abseil-cpp/absl/time/internal/cctz/src/time_zone_fixed.cc @@ -53,7 +53,7 @@ int Parse02d(const char* p) { } // namespace bool FixedOffsetFromName(const std::string& name, seconds* offset) { - if (name.compare(0, std::string::npos, "UTC", 3) == 0) { + if (name == "UTC" || name == "UTC0") { *offset = seconds::zero(); return true; } diff --git a/third_party/abseil-cpp/absl/time/internal/cctz/src/time_zone_format_test.cc b/third_party/abseil-cpp/absl/time/internal/cctz/src/time_zone_format_test.cc index a11f93e2a5..294f2e2284 100644 --- a/third_party/abseil-cpp/absl/time/internal/cctz/src/time_zone_format_test.cc +++ b/third_party/abseil-cpp/absl/time/internal/cctz/src/time_zone_format_test.cc @@ -1135,7 +1135,7 @@ TEST(Parse, ExtendedSeconds) { // All %E<prec>S cases are treated the same as %E*S on input. auto precisions = {"*", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15"}; - for (const std::string& prec : precisions) { + for (const std::string prec : precisions) { const std::string fmt = "%E" + prec + "S"; SCOPED_TRACE(fmt); time_point<chrono::nanoseconds> tp = unix_epoch; @@ -1217,7 +1217,7 @@ TEST(Parse, ExtendedSubeconds) { // All %E<prec>f cases are treated the same as %E*f on input. 
auto precisions = {"*", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15"}; - for (const std::string& prec : precisions) { + for (const std::string prec : precisions) { const std::string fmt = "%E" + prec + "f"; SCOPED_TRACE(fmt); time_point<chrono::nanoseconds> tp = unix_epoch - chrono::seconds(1); diff --git a/third_party/abseil-cpp/absl/time/internal/cctz/src/time_zone_lookup_test.cc b/third_party/abseil-cpp/absl/time/internal/cctz/src/time_zone_lookup_test.cc index 9a1a8d6e40..6948c3ea2c 100644 --- a/third_party/abseil-cpp/absl/time/internal/cctz/src/time_zone_lookup_test.cc +++ b/third_party/abseil-cpp/absl/time/internal/cctz/src/time_zone_lookup_test.cc @@ -717,6 +717,18 @@ TEST(TimeZones, LoadZonesConcurrently) { } #endif +TEST(TimeZone, UTC) { + const time_zone utc = utc_time_zone(); + + time_zone loaded_utc; + EXPECT_TRUE(load_time_zone("UTC", &loaded_utc)); + EXPECT_EQ(loaded_utc, utc); + + time_zone loaded_utc0; + EXPECT_TRUE(load_time_zone("UTC0", &loaded_utc0)); + EXPECT_EQ(loaded_utc0, utc); +} + TEST(TimeZone, NamedTimeZones) { const time_zone utc = utc_time_zone(); EXPECT_EQ("UTC", utc.name()); diff --git a/third_party/abseil-cpp/absl/time/time.h b/third_party/abseil-cpp/absl/time/time.h index d9ad1aedd8..48982df45a 100644 --- a/third_party/abseil-cpp/absl/time/time.h +++ b/third_party/abseil-cpp/absl/time/time.h @@ -1180,11 +1180,15 @@ inline Time FromDateTime(int64_t year, int mon, int day, int hour, // // Converts the `tm_year`, `tm_mon`, `tm_mday`, `tm_hour`, `tm_min`, and // `tm_sec` fields to an `absl::Time` using the given time zone. See ctime(3) -// for a description of the expected values of the tm fields. If the indicated -// time instant is not unique (see `absl::TimeZone::At(absl::CivilSecond)` -// above), the `tm_isdst` field is consulted to select the desired instant -// (`tm_isdst` > 0 means DST, `tm_isdst` == 0 means no DST, `tm_isdst` < 0 -// means use the post-transition offset). 
+// for a description of the expected values of the tm fields. If the civil time +// is unique (see `absl::TimeZone::At(absl::CivilSecond)` above), the matching +// time instant is returned. Otherwise, the `tm_isdst` field is consulted to +// choose between the possible results. For a repeated civil time, `tm_isdst != +// 0` returns the matching DST instant, while `tm_isdst == 0` returns the +// matching non-DST instant. For a skipped civil time there is no matching +// instant, so `tm_isdst != 0` returns the DST instant, and `tm_isdst == 0` +// returns the non-DST instant, that would have matched if the transition never +// happened. Time FromTM(const struct tm& tm, TimeZone tz); // ToTM() @@ -1348,7 +1352,7 @@ constexpr Duration MakeDuration(int64_t hi, int64_t lo) { inline Duration MakePosDoubleDuration(double n) { const int64_t int_secs = static_cast<int64_t>(n); const uint32_t ticks = static_cast<uint32_t>( - (n - static_cast<double>(int_secs)) * kTicksPerSecond + 0.5); + std::round((n - static_cast<double>(int_secs)) * kTicksPerSecond)); return ticks < kTicksPerSecond ? 
MakeDuration(int_secs, ticks) : MakeDuration(int_secs + 1, ticks - kTicksPerSecond); diff --git a/third_party/abseil-cpp/absl/types/CMakeLists.txt b/third_party/abseil-cpp/absl/types/CMakeLists.txt index c356b2117d..d7e8614e0d 100644 --- a/third_party/abseil-cpp/absl/types/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/types/CMakeLists.txt @@ -69,7 +69,7 @@ absl_cc_test( absl::exception_testing absl::raw_logging_internal absl::test_instance_tracker - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -85,7 +85,7 @@ absl_cc_test( absl::exception_testing absl::raw_logging_internal absl::test_instance_tracker - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -99,7 +99,7 @@ absl_cc_test( absl::any absl::config absl::exception_safety_testing - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -136,7 +136,7 @@ absl_cc_test( absl::inlined_vector absl::hash_testing absl::strings - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -156,7 +156,7 @@ absl_cc_test( absl::inlined_vector absl::hash_testing absl::strings - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -222,7 +222,7 @@ absl_cc_test( absl::raw_logging_internal absl::strings absl::type_traits - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -236,7 +236,7 @@ absl_cc_test( absl::optional absl::config absl::exception_safety_testing - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -258,7 +258,7 @@ absl_cc_library( absl::type_traits absl::strings absl::utility - gmock_main + GTest::gmock_main TESTONLY ) @@ -275,7 +275,7 @@ absl_cc_test( DEPS absl::conformance_testing absl::type_traits - gmock_main + GTest::gmock_main ) absl_cc_test( @@ -288,7 +288,7 @@ absl_cc_test( DEPS absl::conformance_testing absl::type_traits - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -324,7 +324,7 @@ absl_cc_test( absl::memory absl::type_traits absl::strings - gmock_main + GTest::gmock_main ) absl_cc_library( @@ -350,7 +350,7 @@ absl_cc_test( DEPS absl::base absl::compare - gmock_main + GTest::gmock_main ) 
absl_cc_test( @@ -365,5 +365,5 @@ absl_cc_test( absl::config absl::exception_safety_testing absl::memory - gmock_main + GTest::gmock_main ) diff --git a/third_party/abseil-cpp/absl/types/span.h b/third_party/abseil-cpp/absl/types/span.h index 95fe79262d..41db3420db 100644 --- a/third_party/abseil-cpp/absl/types/span.h +++ b/third_party/abseil-cpp/absl/types/span.h @@ -243,8 +243,8 @@ class Span { // template <typename LazyT = T, typename = EnableIfConstView<LazyT>> - Span( - std::initializer_list<value_type> v) noexcept // NOLINT(runtime/explicit) + Span(std::initializer_list<value_type> v + ABSL_ATTRIBUTE_LIFETIME_BOUND) noexcept // NOLINT(runtime/explicit) : Span(v.begin(), v.size()) {} // Accessors diff --git a/third_party/abseil-cpp/absl/utility/CMakeLists.txt b/third_party/abseil-cpp/absl/utility/CMakeLists.txt index e1edd19aa0..865b758f23 100644 --- a/third_party/abseil-cpp/absl/utility/CMakeLists.txt +++ b/third_party/abseil-cpp/absl/utility/CMakeLists.txt @@ -40,5 +40,5 @@ absl_cc_test( absl::core_headers absl::memory absl::strings - gmock_main + GTest::gmock_main ) diff --git a/third_party/crc32c/CMakeLists.txt b/third_party/crc32c/CMakeLists.txt new file mode 100644 index 0000000000..bc720892d7 --- /dev/null +++ b/third_party/crc32c/CMakeLists.txt @@ -0,0 +1,66 @@ +android_add_library( + TARGET + crc32c + LICENSE + "BSD-3-Clause" + SRC + src/src/crc32c.cc + src/src/crc32c_portable.cc) +target_include_directories(crc32c PUBLIC config src/include) +target_compile_definitions(crc32c PRIVATE BYTE_ORDER_BIG_ENDIAN=0 + CRC32C_TESTS_BUILT_WITH_GLOG=0) +if(LINUX_AARCH64 OR DARWIN_AARCH64) + target_compile_definitions(crc32c PRIVATE HAVE_MM_PREFETCH=0 HAVE_SSE42=0) + target_sources(crc32c PRIVATE src/src/crc32c_arm64.cc) + if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + target_compile_definitions(crc32c PRIVATE HAVE_ARM64_CRC32C=0) + target_compile_options( + crc32c + PRIVATE "-march=armv8-a" + # Some builds set -march to a different value from the above. 
The + # specific feature flags below enable the instructions we need in + # these cases. See https://crbug.com/934016 for example. + "-Xclang -target-feature" + "-Xclang +crc" + "-Xclang -target-feature" + "-Xclang +crypto") + else() + target_compile_options(crc32c PRIVATE "-march=armv8-a+crc+crypto") + target_compile_definitions(crc32c PRIVATE HAVE_ARM64_CRC32C=1) + endif() +else() + target_compile_definitions(crc32c PRIVATE HAVE_MM_PREFETCH=1 HAVE_SSE42=1) + target_compile_definitions(crc32c PRIVATE HAVE_ARM64_CRC32C=0) + target_sources(crc32c PRIVATE src/src/crc32c_sse42.cc) + if(WINDOWS_MSVC_X86_64) + target_compile_options(crc32c PRIVATE -mavx) + else() + target_compile_options(crc32c PRIVATE -msse4.2) + endif() +endif() + +target_compile_definitions(crc32c PRIVATE HAVE_BUILTIN_PREFETCH=1) + +if(LINUX_AARCH64 OR LINUX_X86_64) + target_compile_definitions(crc32c PRIVATE HAVE_STRONG_GETAUXVAL=1 + HAVE_WEAK_GETAUXVAL=1) + +else() + target_compile_definitions(crc32c PRIVATE HAVE_STRONG_GETAUXVAL=0 + HAVE_WEAK_GETAUXVAL=0) +endif() + +android_add_test( + TARGET + crc32c_tests + SRC + "src/src/crc32c_arm64_unittest.cc" + "src/src/crc32c_extend_unittests.h" + "src/src/crc32c_portable_unittest.cc" + "src/src/crc32c_prefetch_unittest.cc" + "src/src/crc32c_read_le_unittest.cc" + "src/src/crc32c_round_up_unittest.cc" + "src/src/crc32c_sse42_unittest.cc" + "src/src/crc32c_unittest.cc") + +target_link_libraries(crc32c_tests PRIVATE crc32c gmock_main) diff --git a/third_party/crc32c/config/crc32c/crc32c_config.h b/third_party/crc32c/config/crc32c/crc32c_config.h new file mode 100644 index 0000000000..3589fa678c --- /dev/null +++ b/third_party/crc32c/config/crc32c/crc32c_config.h @@ -0,0 +1,6 @@ +// Copyright 2017 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// This is a stub. The preprocessor macros that are usually defined here are +// supplied by BUILD.gn instead. 
diff --git a/third_party/crc32c/src/include/crc32c/crc32c.h b/third_party/crc32c/src/include/crc32c/crc32c.h new file mode 100644 index 0000000000..e8a78170a9 --- /dev/null +++ b/third_party/crc32c/src/include/crc32c/crc32c.h @@ -0,0 +1,89 @@ +/* Copyright 2017 The CRC32C Authors. All rights reserved. + Use of this source code is governed by a BSD-style license that can be + found in the LICENSE file. See the AUTHORS file for names of contributors. */ + +#ifndef CRC32C_CRC32C_H_ +#define CRC32C_CRC32C_H_ + +/* The API exported by the CRC32C project. */ + +#if defined(__cplusplus) + +#include <cstddef> +#include <cstdint> +#include <string> + +#else /* !defined(__cplusplus) */ + +#include <stddef.h> +#include <stdint.h> + +#endif /* !defined(__cplusplus) */ + + +/* The C API. */ + +#if defined(__cplusplus) +extern "C" { +#endif /* defined(__cplusplus) */ + +/* Extends "crc" with the CRC32C of "count" bytes in the buffer pointed by + "data" */ +uint32_t crc32c_extend(uint32_t crc, const uint8_t* data, size_t count); + +/* Computes the CRC32C of "count" bytes in the buffer pointed by "data". */ +uint32_t crc32c_value(const uint8_t* data, size_t count); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif /* defined(__cplusplus) */ + + +/* The C++ API. */ + +#if defined(__cplusplus) + +namespace crc32c { + +// Extends "crc" with the CRC32C of "count" bytes in the buffer pointed by +// "data". +uint32_t Extend(uint32_t crc, const uint8_t* data, size_t count); + +// Computes the CRC32C of "count" bytes in the buffer pointed by "data". +inline uint32_t Crc32c(const uint8_t* data, size_t count) { + return Extend(0, data, count); +} + +// Computes the CRC32C of "count" bytes in the buffer pointed by "data". +inline uint32_t Crc32c(const char* data, size_t count) { + return Extend(0, reinterpret_cast<const uint8_t*>(data), count); +} + +// Computes the CRC32C of the string's content. 
+inline uint32_t Crc32c(const std::string& string) { + return Crc32c(reinterpret_cast<const uint8_t*>(string.data()), + string.size()); +} + +} // namespace crc32c + +#if __cplusplus > 201402L +#if __has_include(<string_view>) +#include <string_view> + +namespace crc32c { + +// Computes the CRC32C of the bytes in the string_view. +inline uint32_t Crc32c(const std::string_view& string_view) { + return Crc32c(reinterpret_cast<const uint8_t*>(string_view.data()), + string_view.size()); +} + +} // namespace crc32c + +#endif // __has_include(<string_view>) +#endif // __cplusplus > 201402L + +#endif /* defined(__cplusplus) */ + +#endif // CRC32C_CRC32C_H_ diff --git a/third_party/crc32c/src/src/crc32c.cc b/third_party/crc32c/src/src/crc32c.cc new file mode 100644 index 0000000000..4d3018af47 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c.cc @@ -0,0 +1,39 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "crc32c/crc32c.h" + +#include <cstddef> +#include <cstdint> + +#include "./crc32c_arm64.h" +#include "./crc32c_arm64_linux_check.h" +#include "./crc32c_internal.h" +#include "./crc32c_sse42.h" +#include "./crc32c_sse42_check.h" + +namespace crc32c { + +uint32_t Extend(uint32_t crc, const uint8_t* data, size_t count) { +#if HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) + static bool can_use_sse42 = CanUseSse42(); + if (can_use_sse42) return ExtendSse42(crc, data, count); +#elif HAVE_ARM64_CRC32C + static bool can_use_arm_linux = CanUseArm64Linux(); + if (can_use_arm_linux) return ExtendArm64(crc, data, count); +#endif // HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) + + return ExtendPortable(crc, data, count); +} + +extern "C" uint32_t crc32c_extend(uint32_t crc, const uint8_t* data, + size_t count) { + return crc32c::Extend(crc, data, count); +} + +extern "C" uint32_t crc32c_value(const uint8_t* data, size_t count) { + return crc32c::Crc32c(data, count); +} + +} // namespace crc32c diff --git a/third_party/crc32c/src/src/crc32c_arm64.cc b/third_party/crc32c/src/src/crc32c_arm64.cc new file mode 100644 index 0000000000..9a988c1eed --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_arm64.cc @@ -0,0 +1,124 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "./crc32c_arm64.h" + +// In a separate source file to allow this accelerated CRC32C function to be +// compiled with the appropriate compiler flags to enable ARM NEON CRC32C +// instructions. + +// This implementation is based on https://github.com/google/leveldb/pull/490. 
+ +#include <cstddef> +#include <cstdint> + +#include "./crc32c_internal.h" +#include "crc32c/crc32c_config.h" + +#if HAVE_ARM64_CRC32C + +#include <arm_acle.h> +#include <arm_neon.h> + +#define KBYTES 1032 +#define SEGMENTBYTES 256 + +// compute 8bytes for each segment parallelly +#define CRC32C32BYTES(P, IND) \ + do { \ + crc1 = __crc32cd( \ + crc1, *((const uint64_t *)(P) + (SEGMENTBYTES / 8) * 1 + (IND))); \ + crc2 = __crc32cd( \ + crc2, *((const uint64_t *)(P) + (SEGMENTBYTES / 8) * 2 + (IND))); \ + crc3 = __crc32cd( \ + crc3, *((const uint64_t *)(P) + (SEGMENTBYTES / 8) * 3 + (IND))); \ + crc0 = __crc32cd( \ + crc0, *((const uint64_t *)(P) + (SEGMENTBYTES / 8) * 0 + (IND))); \ + } while (0); + +// compute 8*8 bytes for each segment parallelly +#define CRC32C256BYTES(P, IND) \ + do { \ + CRC32C32BYTES((P), (IND)*8 + 0) \ + CRC32C32BYTES((P), (IND)*8 + 1) \ + CRC32C32BYTES((P), (IND)*8 + 2) \ + CRC32C32BYTES((P), (IND)*8 + 3) \ + CRC32C32BYTES((P), (IND)*8 + 4) \ + CRC32C32BYTES((P), (IND)*8 + 5) \ + CRC32C32BYTES((P), (IND)*8 + 6) \ + CRC32C32BYTES((P), (IND)*8 + 7) \ + } while (0); + +// compute 4*8*8 bytes for each segment parallelly +#define CRC32C1024BYTES(P) \ + do { \ + CRC32C256BYTES((P), 0) \ + CRC32C256BYTES((P), 1) \ + CRC32C256BYTES((P), 2) \ + CRC32C256BYTES((P), 3) \ + (P) += 4 * SEGMENTBYTES; \ + } while (0) + +namespace crc32c { + +uint32_t ExtendArm64(uint32_t crc, const uint8_t *buf, size_t size) { + int64_t length = size; + uint32_t crc0, crc1, crc2, crc3; + uint64_t t0, t1, t2; + + // k0=CRC(x^(3*SEGMENTBYTES*8)), k1=CRC(x^(2*SEGMENTBYTES*8)), + // k2=CRC(x^(SEGMENTBYTES*8)) + const poly64_t k0 = 0x8d96551c, k1 = 0xbd6f81f8, k2 = 0xdcb17aa4; + + crc = crc ^ kCRC32Xor; + const uint8_t *p = reinterpret_cast<const uint8_t *>(buf); + + while (length >= KBYTES) { + crc0 = crc; + crc1 = 0; + crc2 = 0; + crc3 = 0; + + // Process 1024 bytes in parallel. + CRC32C1024BYTES(p); + + // Merge the 4 partial CRC32C values. 
+ t2 = (uint64_t)vmull_p64(crc2, k2); + t1 = (uint64_t)vmull_p64(crc1, k1); + t0 = (uint64_t)vmull_p64(crc0, k0); + crc = __crc32cd(crc3, *(uint64_t *)p); + p += sizeof(uint64_t); + crc ^= __crc32cd(0, t2); + crc ^= __crc32cd(0, t1); + crc ^= __crc32cd(0, t0); + + length -= KBYTES; + } + + while (length >= 8) { + crc = __crc32cd(crc, *(uint64_t *)p); + p += 8; + length -= 8; + } + + if (length & 4) { + crc = __crc32cw(crc, *(uint32_t *)p); + p += 4; + } + + if (length & 2) { + crc = __crc32ch(crc, *(uint16_t *)p); + p += 2; + } + + if (length & 1) { + crc = __crc32cb(crc, *p); + } + + return crc ^ kCRC32Xor; +} + +} // namespace crc32c + +#endif // HAVE_ARM64_CRC32C diff --git a/third_party/crc32c/src/src/crc32c_arm64.h b/third_party/crc32c/src/src/crc32c_arm64.h new file mode 100644 index 0000000000..bbdece46c7 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_arm64.h @@ -0,0 +1,25 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// Linux-specific code checking the availability for ARM CRC32C instructions. + +#ifndef CRC32C_CRC32C_ARM_LINUX_H_ +#define CRC32C_CRC32C_ARM_LINUX_H_ + +#include <cstddef> +#include <cstdint> + +#include "crc32c/crc32c_config.h" + +#if HAVE_ARM64_CRC32C + +namespace crc32c { + +uint32_t ExtendArm64(uint32_t crc, const uint8_t* data, size_t count); + +} // namespace crc32c + +#endif // HAVE_ARM64_CRC32C + +#endif // CRC32C_CRC32C_ARM_LINUX_H_ diff --git a/third_party/crc32c/src/src/crc32c_arm64_linux_check.h b/third_party/crc32c/src/src/crc32c_arm64_linux_check.h new file mode 100644 index 0000000000..6817979aac --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_arm64_linux_check.h @@ -0,0 +1,48 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +// ARM Linux-specific code checking for the availability of CRC32C instructions. + +#ifndef CRC32C_CRC32C_ARM_LINUX_CHECK_H_ +#define CRC32C_CRC32C_ARM_LINUX_CHECK_H_ + +// X86-specific code checking for the availability of SSE4.2 instructions. + +#include <cstddef> +#include <cstdint> + +#include "crc32c/crc32c_config.h" + +#if HAVE_ARM64_CRC32C + +#if HAVE_STRONG_GETAUXVAL +#include <sys/auxv.h> +#elif HAVE_WEAK_GETAUXVAL +// getauxval() is not available on Android until API level 20. Link it as a weak +// symbol. +extern "C" unsigned long getauxval(unsigned long type) __attribute__((weak)); + +#define AT_HWCAP 16 +#endif // HAVE_STRONG_GETAUXVAL || HAVE_WEAK_GETAUXVAL + +namespace crc32c { + +inline bool CanUseArm64Linux() { +#if HAVE_STRONG_GETAUXVAL || HAVE_WEAK_GETAUXVAL + // From 'arch/arm64/include/uapi/asm/hwcap.h' in Linux kernel source code. + constexpr unsigned long kHWCAP_PMULL = 1 << 4; + constexpr unsigned long kHWCAP_CRC32 = 1 << 7; + unsigned long hwcap = (&getauxval != nullptr) ? getauxval(AT_HWCAP) : 0; + return (hwcap & (kHWCAP_PMULL | kHWCAP_CRC32)) == + (kHWCAP_PMULL | kHWCAP_CRC32); +#else + return false; +#endif // HAVE_STRONG_GETAUXVAL || HAVE_WEAK_GETAUXVAL +} + +} // namespace crc32c + +#endif // HAVE_ARM64_CRC32C + +#endif // CRC32C_CRC32C_ARM_LINUX_CHECK_H_ diff --git a/third_party/crc32c/src/src/crc32c_arm64_unittest.cc b/third_party/crc32c/src/src/crc32c_arm64_unittest.cc new file mode 100644 index 0000000000..6f917d9c0c --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_arm64_unittest.cc @@ -0,0 +1,24 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "gtest/gtest.h" + +#include "./crc32c_arm64.h" +#include "./crc32c_extend_unittests.h" + +namespace crc32c { + +#if HAVE_ARM64_CRC32C + +struct Arm64TestTraits { + static uint32_t Extend(uint32_t crc, const uint8_t* data, size_t count) { + return ExtendArm64(crc, data, count); + } +}; + +INSTANTIATE_TYPED_TEST_SUITE_P(Arm64, ExtendTest, Arm64TestTraits); + +#endif // HAVE_ARM64_CRC32C + +} // namespace crc32c diff --git a/third_party/crc32c/src/src/crc32c_benchmark.cc b/third_party/crc32c/src/src/crc32c_benchmark.cc new file mode 100644 index 0000000000..d31af92256 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_benchmark.cc @@ -0,0 +1,104 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include <cstddef> +#include <cstdint> + +#include "crc32c/crc32c_config.h" + +#include "benchmark/benchmark.h" + +#if CRC32C_TESTS_BUILT_WITH_GLOG +#include "glog/logging.h" +#endif // CRC32C_TESTS_BUILT_WITH_GLOG + +#include "./crc32c_arm64.h" +#include "./crc32c_arm64_linux_check.h" +#include "./crc32c_internal.h" +#include "./crc32c_sse42.h" +#include "./crc32c_sse42_check.h" +#include "crc32c/crc32c.h" + +class CRC32CBenchmark : public benchmark::Fixture { + public: + void SetUp(const benchmark::State& state) override { + block_size_ = static_cast<size_t>(state.range(0)); + block_data_ = std::string(block_size_, 'x'); + block_buffer_ = reinterpret_cast<const uint8_t*>(block_data_.data()); + } + + protected: + std::string block_data_; + const uint8_t* block_buffer_; + size_t block_size_; +}; + +BENCHMARK_DEFINE_F(CRC32CBenchmark, Public)(benchmark::State& state) { + uint32_t crc = 0; + for (auto _ : state) + crc = crc32c::Extend(crc, block_buffer_, block_size_); + state.SetBytesProcessed(state.iterations() * block_size_); +} +BENCHMARK_REGISTER_F(CRC32CBenchmark, Public) + 
->RangeMultiplier(16) + ->Range(256, 16777216); // Block size. + +BENCHMARK_DEFINE_F(CRC32CBenchmark, Portable)(benchmark::State& state) { + uint32_t crc = 0; + for (auto _ : state) + crc = crc32c::ExtendPortable(crc, block_buffer_, block_size_); + state.SetBytesProcessed(state.iterations() * block_size_); +} +BENCHMARK_REGISTER_F(CRC32CBenchmark, Portable) + ->RangeMultiplier(16) + ->Range(256, 16777216); // Block size. + +#if HAVE_ARM64_CRC32C + +BENCHMARK_DEFINE_F(CRC32CBenchmark, ArmLinux)(benchmark::State& state) { + if (!crc32c::CanUseArm64Linux()) { + state.SkipWithError("ARM CRC32C instructions not available or not enabled"); + return; + } + + uint32_t crc = 0; + for (auto _ : state) + crc = crc32c::ExtendArm64(crc, block_buffer_, block_size_); + state.SetBytesProcessed(state.iterations() * block_size_); +} +BENCHMARK_REGISTER_F(CRC32CBenchmark, ArmLinux) + ->RangeMultiplier(16) + ->Range(256, 16777216); // Block size. + +#endif // HAVE_ARM64_CRC32C + +#if HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) + +BENCHMARK_DEFINE_F(CRC32CBenchmark, Sse42)(benchmark::State& state) { + if (!crc32c::CanUseSse42()) { + state.SkipWithError("SSE4.2 instructions not available or not enabled"); + return; + } + + uint32_t crc = 0; + for (auto _ : state) + crc = crc32c::ExtendSse42(crc, block_buffer_, block_size_); + state.SetBytesProcessed(state.iterations() * block_size_); +} +BENCHMARK_REGISTER_F(CRC32CBenchmark, Sse42) + ->RangeMultiplier(16) + ->Range(256, 16777216); // Block size. 
+ +#endif // HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) + +int main(int argc, char** argv) { +#if CRC32C_TESTS_BUILT_WITH_GLOG + google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); +#endif // CRC32C_TESTS_BUILT_WITH_GLOG + + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); + return 0; +} diff --git a/third_party/crc32c/src/src/crc32c_capi_unittest.c b/third_party/crc32c/src/src/crc32c_capi_unittest.c new file mode 100644 index 0000000000..c8993a0959 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_capi_unittest.c @@ -0,0 +1,66 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "crc32c/crc32c.h" + +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +int main() { + /* From rfc3720 section B.4. */ + uint8_t buf[32]; + + memset(buf, 0, sizeof(buf)); + if ((uint32_t)0x8a9136aa != crc32c_value(buf, sizeof(buf))) { + printf("crc32c_value(zeros) test failed\n"); + return 1; + } + + memset(buf, 0xff, sizeof(buf)); + if ((uint32_t)0x62a8ab43 != crc32c_value(buf, sizeof(buf))) { + printf("crc32c_value(0xff) test failed\n"); + return 1; + } + + for (size_t i = 0; i < 32; ++i) + buf[i] = (uint8_t)i; + if ((uint32_t)0x46dd794e != crc32c_value(buf, sizeof(buf))) { + printf("crc32c_value(0..31) test failed\n"); + return 1; + } + + for (size_t i = 0; i < 32; ++i) + buf[i] = (uint8_t)(31 - i); + if ((uint32_t)0x113fdb5c != crc32c_value(buf, sizeof(buf))) { + printf("crc32c_value(31..0) test failed\n"); + return 1; + } + + uint8_t data[48] = { + 0x01, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x18, 0x28, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 
0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + if ((uint32_t)0xd9963a56 != crc32c_value(data, sizeof(data))) { + printf("crc32c_value(31..0) test failed\n"); + return 1; + } + + const uint8_t* hello_space_world = (const uint8_t*)"hello world"; + const uint8_t* hello_space = (const uint8_t*)"hello "; + const uint8_t* world = (const uint8_t*)"world"; + + if (crc32c_value(hello_space_world, 11) != + crc32c_extend(crc32c_value(hello_space, 6), world, 5)) { + printf("crc32c_extend test failed\n"); + return 1; + } + + printf("All tests passed\n"); + return 0; +} diff --git a/third_party/crc32c/src/src/crc32c_extend_unittests.h b/third_party/crc32c/src/src/crc32c_extend_unittests.h new file mode 100644 index 0000000000..0732973737 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_extend_unittests.h @@ -0,0 +1,112 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef CRC32C_CRC32C_EXTEND_UNITTESTS_H_ +#define CRC32C_CRC32C_EXTEND_UNITTESTS_H_ + +#include <cstddef> +#include <cstdint> +#include <cstring> + +#include "gtest/gtest.h" + +// Common test cases for all implementations of CRC32C_Extend(). + +namespace crc32c { + +template<typename TestTraits> +class ExtendTest : public testing::Test {}; + +TYPED_TEST_SUITE_P(ExtendTest); + +TYPED_TEST_P(ExtendTest, StandardResults) { + // From rfc3720 section B.4. 
+ uint8_t buf[32]; + + std::memset(buf, 0, sizeof(buf)); + EXPECT_EQ(static_cast<uint32_t>(0x8a9136aa), + TypeParam::Extend(0, buf, sizeof(buf))); + + std::memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(static_cast<uint32_t>(0x62a8ab43), + TypeParam::Extend(0, buf, sizeof(buf))); + + for (int i = 0; i < 32; ++i) + buf[i] = static_cast<uint8_t>(i); + EXPECT_EQ(static_cast<uint32_t>(0x46dd794e), + TypeParam::Extend(0, buf, sizeof(buf))); + + for (int i = 0; i < 32; ++i) + buf[i] = static_cast<uint8_t>(31 - i); + EXPECT_EQ(static_cast<uint32_t>(0x113fdb5c), + TypeParam::Extend(0, buf, sizeof(buf))); + + uint8_t data[48] = { + 0x01, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x18, 0x28, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + EXPECT_EQ(static_cast<uint32_t>(0xd9963a56), + TypeParam::Extend(0, data, sizeof(data))); +} + +TYPED_TEST_P(ExtendTest, HelloWorld) { + const uint8_t* hello_space_world = + reinterpret_cast<const uint8_t*>("hello world"); + const uint8_t* hello_space = reinterpret_cast<const uint8_t*>("hello "); + const uint8_t* world = reinterpret_cast<const uint8_t*>("world"); + + EXPECT_EQ(TypeParam::Extend(0, hello_space_world, 11), + TypeParam::Extend(TypeParam::Extend(0, hello_space, 6), world, 5)); +} + +TYPED_TEST_P(ExtendTest, BufferSlicing) { + uint8_t buffer[48] = { + 0x01, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x18, 0x28, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + + for (size_t i = 0; i < 48; ++i) { + for (size_t j = i + 1; j <= 48; ++j) { + uint32_t crc = 0; + + if (i > 0) crc = TypeParam::Extend(crc, buffer, i); + crc = TypeParam::Extend(crc, buffer + i, j - 
i); + if (j < 48) crc = TypeParam::Extend(crc, buffer + j, 48 - j); + + EXPECT_EQ(static_cast<uint32_t>(0xd9963a56), crc); + } + } +} + +TYPED_TEST_P(ExtendTest, LargeBufferSlicing) { + uint8_t buffer[2048]; + for (size_t i = 0; i < 2048; i++) + buffer[i] = static_cast<uint8_t>(3 * i * i + 7 * i + 11); + + for (size_t i = 0; i < 2048; ++i) { + for (size_t j = i + 1; j <= 2048; ++j) { + uint32_t crc = 0; + + if (i > 0) crc = TypeParam::Extend(crc, buffer, i); + crc = TypeParam::Extend(crc, buffer + i, j - i); + if (j < 2048) crc = TypeParam::Extend(crc, buffer + j, 2048 - j); + + EXPECT_EQ(static_cast<uint32_t>(0x36dcc753), crc); + } + } +} + +REGISTER_TYPED_TEST_SUITE_P(ExtendTest, + StandardResults, + HelloWorld, + BufferSlicing, + LargeBufferSlicing); + +} // namespace crc32c + +#endif // CRC32C_CRC32C_EXTEND_UNITTESTS_H_ diff --git a/third_party/crc32c/src/src/crc32c_internal.h b/third_party/crc32c/src/src/crc32c_internal.h new file mode 100644 index 0000000000..2bd23dea43 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_internal.h @@ -0,0 +1,23 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef CRC32C_CRC32C_INTERNAL_H_ +#define CRC32C_CRC32C_INTERNAL_H_ + +// Internal functions that may change between releases. + +#include <cstddef> +#include <cstdint> + +namespace crc32c { + +// Un-accelerated implementation that works on all CPUs. +uint32_t ExtendPortable(uint32_t crc, const uint8_t* data, size_t count); + +// CRCs are pre- and post- conditioned by xoring with all ones. 
+static constexpr const uint32_t kCRC32Xor = static_cast<uint32_t>(0xffffffffU); + +} // namespace crc32c + +#endif // CRC32C_CRC32C_INTERNAL_H_ diff --git a/third_party/crc32c/src/src/crc32c_portable.cc b/third_party/crc32c/src/src/crc32c_portable.cc new file mode 100644 index 0000000000..31ec6eac53 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_portable.cc @@ -0,0 +1,351 @@ +// Copyright 2008 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "./crc32c_internal.h" + +#include <cstddef> +#include <cstdint> + +#include "./crc32c_prefetch.h" +#include "./crc32c_read_le.h" +#include "./crc32c_round_up.h" + +namespace { + +const uint32_t kByteExtensionTable[256] = { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, + 0x26a1e7e8, 0xd4ca64eb, 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, + 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, 0x105ec76f, 0xe235446c, + 0xf165b798, 0x030e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, + 0xbc267848, 0x4e4dfb4b, 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, + 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, 0xaa64d611, 0x580f5512, + 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x05125dad, + 0x1642ae59, 0xe4292d5a, 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, + 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, 0x417b1dbc, 0xb3109ebf, + 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0x0c38d26c, 0xfe53516f, + 0xed03a29b, 0x1f682198, 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, + 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, 0xdbfc821c, 0x2997011f, + 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 
0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, + 0x4767748a, 0xb50cf789, 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, + 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, 0x7198540d, 0x83f3d70e, + 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, + 0xdde0eb2a, 0x2f8b6829, 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, + 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, 0x082f63b7, 0xfa44e0b4, + 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, + 0xb4091bff, 0x466298fc, 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, + 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, 0xa24bb5a6, 0x502036a5, + 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, + 0x0e330a81, 0xfc588982, 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, + 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, 0x38cc2a06, 0xcaa7a905, + 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x0417b1db, 0xf67c32d8, + 0xe52cc12c, 0x1747422f, 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, 0xd3d3e1ab, 0x21b862a8, + 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, + 0x7fab5e8c, 0x8dc0dd8f, 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, 0x69e9f0d5, 0x9b8273d6, + 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, + 0xd5cf889d, 0x27a40b9e, 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351}; 
+ +const uint32_t kStrideExtensionTable0[256] = { + 0x00000000, 0x30d23865, 0x61a470ca, 0x517648af, 0xc348e194, 0xf39ad9f1, + 0xa2ec915e, 0x923ea93b, 0x837db5d9, 0xb3af8dbc, 0xe2d9c513, 0xd20bfd76, + 0x4035544d, 0x70e76c28, 0x21912487, 0x11431ce2, 0x03171d43, 0x33c52526, + 0x62b36d89, 0x526155ec, 0xc05ffcd7, 0xf08dc4b2, 0xa1fb8c1d, 0x9129b478, + 0x806aa89a, 0xb0b890ff, 0xe1ced850, 0xd11ce035, 0x4322490e, 0x73f0716b, + 0x228639c4, 0x125401a1, 0x062e3a86, 0x36fc02e3, 0x678a4a4c, 0x57587229, + 0xc566db12, 0xf5b4e377, 0xa4c2abd8, 0x941093bd, 0x85538f5f, 0xb581b73a, + 0xe4f7ff95, 0xd425c7f0, 0x461b6ecb, 0x76c956ae, 0x27bf1e01, 0x176d2664, + 0x053927c5, 0x35eb1fa0, 0x649d570f, 0x544f6f6a, 0xc671c651, 0xf6a3fe34, + 0xa7d5b69b, 0x97078efe, 0x8644921c, 0xb696aa79, 0xe7e0e2d6, 0xd732dab3, + 0x450c7388, 0x75de4bed, 0x24a80342, 0x147a3b27, 0x0c5c750c, 0x3c8e4d69, + 0x6df805c6, 0x5d2a3da3, 0xcf149498, 0xffc6acfd, 0xaeb0e452, 0x9e62dc37, + 0x8f21c0d5, 0xbff3f8b0, 0xee85b01f, 0xde57887a, 0x4c692141, 0x7cbb1924, + 0x2dcd518b, 0x1d1f69ee, 0x0f4b684f, 0x3f99502a, 0x6eef1885, 0x5e3d20e0, + 0xcc0389db, 0xfcd1b1be, 0xada7f911, 0x9d75c174, 0x8c36dd96, 0xbce4e5f3, + 0xed92ad5c, 0xdd409539, 0x4f7e3c02, 0x7fac0467, 0x2eda4cc8, 0x1e0874ad, + 0x0a724f8a, 0x3aa077ef, 0x6bd63f40, 0x5b040725, 0xc93aae1e, 0xf9e8967b, + 0xa89eded4, 0x984ce6b1, 0x890ffa53, 0xb9ddc236, 0xe8ab8a99, 0xd879b2fc, + 0x4a471bc7, 0x7a9523a2, 0x2be36b0d, 0x1b315368, 0x096552c9, 0x39b76aac, + 0x68c12203, 0x58131a66, 0xca2db35d, 0xfaff8b38, 0xab89c397, 0x9b5bfbf2, + 0x8a18e710, 0xbacadf75, 0xebbc97da, 0xdb6eafbf, 0x49500684, 0x79823ee1, + 0x28f4764e, 0x18264e2b, 0x18b8ea18, 0x286ad27d, 0x791c9ad2, 0x49cea2b7, + 0xdbf00b8c, 0xeb2233e9, 0xba547b46, 0x8a864323, 0x9bc55fc1, 0xab1767a4, + 0xfa612f0b, 0xcab3176e, 0x588dbe55, 0x685f8630, 0x3929ce9f, 0x09fbf6fa, + 0x1baff75b, 0x2b7dcf3e, 0x7a0b8791, 0x4ad9bff4, 0xd8e716cf, 0xe8352eaa, + 0xb9436605, 0x89915e60, 0x98d24282, 0xa8007ae7, 0xf9763248, 0xc9a40a2d, + 0x5b9aa316, 0x6b489b73, 
0x3a3ed3dc, 0x0aecebb9, 0x1e96d09e, 0x2e44e8fb, + 0x7f32a054, 0x4fe09831, 0xddde310a, 0xed0c096f, 0xbc7a41c0, 0x8ca879a5, + 0x9deb6547, 0xad395d22, 0xfc4f158d, 0xcc9d2de8, 0x5ea384d3, 0x6e71bcb6, + 0x3f07f419, 0x0fd5cc7c, 0x1d81cddd, 0x2d53f5b8, 0x7c25bd17, 0x4cf78572, + 0xdec92c49, 0xee1b142c, 0xbf6d5c83, 0x8fbf64e6, 0x9efc7804, 0xae2e4061, + 0xff5808ce, 0xcf8a30ab, 0x5db49990, 0x6d66a1f5, 0x3c10e95a, 0x0cc2d13f, + 0x14e49f14, 0x2436a771, 0x7540efde, 0x4592d7bb, 0xd7ac7e80, 0xe77e46e5, + 0xb6080e4a, 0x86da362f, 0x97992acd, 0xa74b12a8, 0xf63d5a07, 0xc6ef6262, + 0x54d1cb59, 0x6403f33c, 0x3575bb93, 0x05a783f6, 0x17f38257, 0x2721ba32, + 0x7657f29d, 0x4685caf8, 0xd4bb63c3, 0xe4695ba6, 0xb51f1309, 0x85cd2b6c, + 0x948e378e, 0xa45c0feb, 0xf52a4744, 0xc5f87f21, 0x57c6d61a, 0x6714ee7f, + 0x3662a6d0, 0x06b09eb5, 0x12caa592, 0x22189df7, 0x736ed558, 0x43bced3d, + 0xd1824406, 0xe1507c63, 0xb02634cc, 0x80f40ca9, 0x91b7104b, 0xa165282e, + 0xf0136081, 0xc0c158e4, 0x52fff1df, 0x622dc9ba, 0x335b8115, 0x0389b970, + 0x11ddb8d1, 0x210f80b4, 0x7079c81b, 0x40abf07e, 0xd2955945, 0xe2476120, + 0xb331298f, 0x83e311ea, 0x92a00d08, 0xa272356d, 0xf3047dc2, 0xc3d645a7, + 0x51e8ec9c, 0x613ad4f9, 0x304c9c56, 0x009ea433}; + +const uint32_t kStrideExtensionTable1[256] = { + 0x00000000, 0x54075546, 0xa80eaa8c, 0xfc09ffca, 0x55f123e9, 0x01f676af, + 0xfdff8965, 0xa9f8dc23, 0xabe247d2, 0xffe51294, 0x03eced5e, 0x57ebb818, + 0xfe13643b, 0xaa14317d, 0x561dceb7, 0x021a9bf1, 0x5228f955, 0x062fac13, + 0xfa2653d9, 0xae21069f, 0x07d9dabc, 0x53de8ffa, 0xafd77030, 0xfbd02576, + 0xf9cabe87, 0xadcdebc1, 0x51c4140b, 0x05c3414d, 0xac3b9d6e, 0xf83cc828, + 0x043537e2, 0x503262a4, 0xa451f2aa, 0xf056a7ec, 0x0c5f5826, 0x58580d60, + 0xf1a0d143, 0xa5a78405, 0x59ae7bcf, 0x0da92e89, 0x0fb3b578, 0x5bb4e03e, + 0xa7bd1ff4, 0xf3ba4ab2, 0x5a429691, 0x0e45c3d7, 0xf24c3c1d, 0xa64b695b, + 0xf6790bff, 0xa27e5eb9, 0x5e77a173, 0x0a70f435, 0xa3882816, 0xf78f7d50, + 0x0b86829a, 0x5f81d7dc, 0x5d9b4c2d, 0x099c196b, 0xf595e6a1, 0xa192b3e7, 
+ 0x086a6fc4, 0x5c6d3a82, 0xa064c548, 0xf463900e, 0x4d4f93a5, 0x1948c6e3, + 0xe5413929, 0xb1466c6f, 0x18beb04c, 0x4cb9e50a, 0xb0b01ac0, 0xe4b74f86, + 0xe6add477, 0xb2aa8131, 0x4ea37efb, 0x1aa42bbd, 0xb35cf79e, 0xe75ba2d8, + 0x1b525d12, 0x4f550854, 0x1f676af0, 0x4b603fb6, 0xb769c07c, 0xe36e953a, + 0x4a964919, 0x1e911c5f, 0xe298e395, 0xb69fb6d3, 0xb4852d22, 0xe0827864, + 0x1c8b87ae, 0x488cd2e8, 0xe1740ecb, 0xb5735b8d, 0x497aa447, 0x1d7df101, + 0xe91e610f, 0xbd193449, 0x4110cb83, 0x15179ec5, 0xbcef42e6, 0xe8e817a0, + 0x14e1e86a, 0x40e6bd2c, 0x42fc26dd, 0x16fb739b, 0xeaf28c51, 0xbef5d917, + 0x170d0534, 0x430a5072, 0xbf03afb8, 0xeb04fafe, 0xbb36985a, 0xef31cd1c, + 0x133832d6, 0x473f6790, 0xeec7bbb3, 0xbac0eef5, 0x46c9113f, 0x12ce4479, + 0x10d4df88, 0x44d38ace, 0xb8da7504, 0xecdd2042, 0x4525fc61, 0x1122a927, + 0xed2b56ed, 0xb92c03ab, 0x9a9f274a, 0xce98720c, 0x32918dc6, 0x6696d880, + 0xcf6e04a3, 0x9b6951e5, 0x6760ae2f, 0x3367fb69, 0x317d6098, 0x657a35de, + 0x9973ca14, 0xcd749f52, 0x648c4371, 0x308b1637, 0xcc82e9fd, 0x9885bcbb, + 0xc8b7de1f, 0x9cb08b59, 0x60b97493, 0x34be21d5, 0x9d46fdf6, 0xc941a8b0, + 0x3548577a, 0x614f023c, 0x635599cd, 0x3752cc8b, 0xcb5b3341, 0x9f5c6607, + 0x36a4ba24, 0x62a3ef62, 0x9eaa10a8, 0xcaad45ee, 0x3eced5e0, 0x6ac980a6, + 0x96c07f6c, 0xc2c72a2a, 0x6b3ff609, 0x3f38a34f, 0xc3315c85, 0x973609c3, + 0x952c9232, 0xc12bc774, 0x3d2238be, 0x69256df8, 0xc0ddb1db, 0x94dae49d, + 0x68d31b57, 0x3cd44e11, 0x6ce62cb5, 0x38e179f3, 0xc4e88639, 0x90efd37f, + 0x39170f5c, 0x6d105a1a, 0x9119a5d0, 0xc51ef096, 0xc7046b67, 0x93033e21, + 0x6f0ac1eb, 0x3b0d94ad, 0x92f5488e, 0xc6f21dc8, 0x3afbe202, 0x6efcb744, + 0xd7d0b4ef, 0x83d7e1a9, 0x7fde1e63, 0x2bd94b25, 0x82219706, 0xd626c240, + 0x2a2f3d8a, 0x7e2868cc, 0x7c32f33d, 0x2835a67b, 0xd43c59b1, 0x803b0cf7, + 0x29c3d0d4, 0x7dc48592, 0x81cd7a58, 0xd5ca2f1e, 0x85f84dba, 0xd1ff18fc, + 0x2df6e736, 0x79f1b270, 0xd0096e53, 0x840e3b15, 0x7807c4df, 0x2c009199, + 0x2e1a0a68, 0x7a1d5f2e, 0x8614a0e4, 0xd213f5a2, 0x7beb2981, 0x2fec7cc7, + 
0xd3e5830d, 0x87e2d64b, 0x73814645, 0x27861303, 0xdb8fecc9, 0x8f88b98f, + 0x267065ac, 0x727730ea, 0x8e7ecf20, 0xda799a66, 0xd8630197, 0x8c6454d1, + 0x706dab1b, 0x246afe5d, 0x8d92227e, 0xd9957738, 0x259c88f2, 0x719bddb4, + 0x21a9bf10, 0x75aeea56, 0x89a7159c, 0xdda040da, 0x74589cf9, 0x205fc9bf, + 0xdc563675, 0x88516333, 0x8a4bf8c2, 0xde4cad84, 0x2245524e, 0x76420708, + 0xdfbadb2b, 0x8bbd8e6d, 0x77b471a7, 0x23b324e1}; + +const uint32_t kStrideExtensionTable2[256] = { + 0x00000000, 0x678efd01, 0xcf1dfa02, 0xa8930703, 0x9bd782f5, 0xfc597ff4, + 0x54ca78f7, 0x334485f6, 0x3243731b, 0x55cd8e1a, 0xfd5e8919, 0x9ad07418, + 0xa994f1ee, 0xce1a0cef, 0x66890bec, 0x0107f6ed, 0x6486e636, 0x03081b37, + 0xab9b1c34, 0xcc15e135, 0xff5164c3, 0x98df99c2, 0x304c9ec1, 0x57c263c0, + 0x56c5952d, 0x314b682c, 0x99d86f2f, 0xfe56922e, 0xcd1217d8, 0xaa9cead9, + 0x020fedda, 0x658110db, 0xc90dcc6c, 0xae83316d, 0x0610366e, 0x619ecb6f, + 0x52da4e99, 0x3554b398, 0x9dc7b49b, 0xfa49499a, 0xfb4ebf77, 0x9cc04276, + 0x34534575, 0x53ddb874, 0x60993d82, 0x0717c083, 0xaf84c780, 0xc80a3a81, + 0xad8b2a5a, 0xca05d75b, 0x6296d058, 0x05182d59, 0x365ca8af, 0x51d255ae, + 0xf94152ad, 0x9ecfafac, 0x9fc85941, 0xf846a440, 0x50d5a343, 0x375b5e42, + 0x041fdbb4, 0x639126b5, 0xcb0221b6, 0xac8cdcb7, 0x97f7ee29, 0xf0791328, + 0x58ea142b, 0x3f64e92a, 0x0c206cdc, 0x6bae91dd, 0xc33d96de, 0xa4b36bdf, + 0xa5b49d32, 0xc23a6033, 0x6aa96730, 0x0d279a31, 0x3e631fc7, 0x59ede2c6, + 0xf17ee5c5, 0x96f018c4, 0xf371081f, 0x94fff51e, 0x3c6cf21d, 0x5be20f1c, + 0x68a68aea, 0x0f2877eb, 0xa7bb70e8, 0xc0358de9, 0xc1327b04, 0xa6bc8605, + 0x0e2f8106, 0x69a17c07, 0x5ae5f9f1, 0x3d6b04f0, 0x95f803f3, 0xf276fef2, + 0x5efa2245, 0x3974df44, 0x91e7d847, 0xf6692546, 0xc52da0b0, 0xa2a35db1, + 0x0a305ab2, 0x6dbea7b3, 0x6cb9515e, 0x0b37ac5f, 0xa3a4ab5c, 0xc42a565d, + 0xf76ed3ab, 0x90e02eaa, 0x387329a9, 0x5ffdd4a8, 0x3a7cc473, 0x5df23972, + 0xf5613e71, 0x92efc370, 0xa1ab4686, 0xc625bb87, 0x6eb6bc84, 0x09384185, + 0x083fb768, 0x6fb14a69, 0xc7224d6a, 0xa0acb06b, 
0x93e8359d, 0xf466c89c, + 0x5cf5cf9f, 0x3b7b329e, 0x2a03aaa3, 0x4d8d57a2, 0xe51e50a1, 0x8290ada0, + 0xb1d42856, 0xd65ad557, 0x7ec9d254, 0x19472f55, 0x1840d9b8, 0x7fce24b9, + 0xd75d23ba, 0xb0d3debb, 0x83975b4d, 0xe419a64c, 0x4c8aa14f, 0x2b045c4e, + 0x4e854c95, 0x290bb194, 0x8198b697, 0xe6164b96, 0xd552ce60, 0xb2dc3361, + 0x1a4f3462, 0x7dc1c963, 0x7cc63f8e, 0x1b48c28f, 0xb3dbc58c, 0xd455388d, + 0xe711bd7b, 0x809f407a, 0x280c4779, 0x4f82ba78, 0xe30e66cf, 0x84809bce, + 0x2c139ccd, 0x4b9d61cc, 0x78d9e43a, 0x1f57193b, 0xb7c41e38, 0xd04ae339, + 0xd14d15d4, 0xb6c3e8d5, 0x1e50efd6, 0x79de12d7, 0x4a9a9721, 0x2d146a20, + 0x85876d23, 0xe2099022, 0x878880f9, 0xe0067df8, 0x48957afb, 0x2f1b87fa, + 0x1c5f020c, 0x7bd1ff0d, 0xd342f80e, 0xb4cc050f, 0xb5cbf3e2, 0xd2450ee3, + 0x7ad609e0, 0x1d58f4e1, 0x2e1c7117, 0x49928c16, 0xe1018b15, 0x868f7614, + 0xbdf4448a, 0xda7ab98b, 0x72e9be88, 0x15674389, 0x2623c67f, 0x41ad3b7e, + 0xe93e3c7d, 0x8eb0c17c, 0x8fb73791, 0xe839ca90, 0x40aacd93, 0x27243092, + 0x1460b564, 0x73ee4865, 0xdb7d4f66, 0xbcf3b267, 0xd972a2bc, 0xbefc5fbd, + 0x166f58be, 0x71e1a5bf, 0x42a52049, 0x252bdd48, 0x8db8da4b, 0xea36274a, + 0xeb31d1a7, 0x8cbf2ca6, 0x242c2ba5, 0x43a2d6a4, 0x70e65352, 0x1768ae53, + 0xbffba950, 0xd8755451, 0x74f988e6, 0x137775e7, 0xbbe472e4, 0xdc6a8fe5, + 0xef2e0a13, 0x88a0f712, 0x2033f011, 0x47bd0d10, 0x46bafbfd, 0x213406fc, + 0x89a701ff, 0xee29fcfe, 0xdd6d7908, 0xbae38409, 0x1270830a, 0x75fe7e0b, + 0x107f6ed0, 0x77f193d1, 0xdf6294d2, 0xb8ec69d3, 0x8ba8ec25, 0xec261124, + 0x44b51627, 0x233beb26, 0x223c1dcb, 0x45b2e0ca, 0xed21e7c9, 0x8aaf1ac8, + 0xb9eb9f3e, 0xde65623f, 0x76f6653c, 0x1178983d}; + +const uint32_t kStrideExtensionTable3[256] = { + 0x00000000, 0xf20c0dfe, 0xe1f46d0d, 0x13f860f3, 0xc604aceb, 0x3408a115, + 0x27f0c1e6, 0xd5fccc18, 0x89e52f27, 0x7be922d9, 0x6811422a, 0x9a1d4fd4, + 0x4fe183cc, 0xbded8e32, 0xae15eec1, 0x5c19e33f, 0x162628bf, 0xe42a2541, + 0xf7d245b2, 0x05de484c, 0xd0228454, 0x222e89aa, 0x31d6e959, 0xc3dae4a7, + 0x9fc30798, 
0x6dcf0a66, 0x7e376a95, 0x8c3b676b, 0x59c7ab73, 0xabcba68d, + 0xb833c67e, 0x4a3fcb80, 0x2c4c517e, 0xde405c80, 0xcdb83c73, 0x3fb4318d, + 0xea48fd95, 0x1844f06b, 0x0bbc9098, 0xf9b09d66, 0xa5a97e59, 0x57a573a7, + 0x445d1354, 0xb6511eaa, 0x63add2b2, 0x91a1df4c, 0x8259bfbf, 0x7055b241, + 0x3a6a79c1, 0xc866743f, 0xdb9e14cc, 0x29921932, 0xfc6ed52a, 0x0e62d8d4, + 0x1d9ab827, 0xef96b5d9, 0xb38f56e6, 0x41835b18, 0x527b3beb, 0xa0773615, + 0x758bfa0d, 0x8787f7f3, 0x947f9700, 0x66739afe, 0x5898a2fc, 0xaa94af02, + 0xb96ccff1, 0x4b60c20f, 0x9e9c0e17, 0x6c9003e9, 0x7f68631a, 0x8d646ee4, + 0xd17d8ddb, 0x23718025, 0x3089e0d6, 0xc285ed28, 0x17792130, 0xe5752cce, + 0xf68d4c3d, 0x048141c3, 0x4ebe8a43, 0xbcb287bd, 0xaf4ae74e, 0x5d46eab0, + 0x88ba26a8, 0x7ab62b56, 0x694e4ba5, 0x9b42465b, 0xc75ba564, 0x3557a89a, + 0x26afc869, 0xd4a3c597, 0x015f098f, 0xf3530471, 0xe0ab6482, 0x12a7697c, + 0x74d4f382, 0x86d8fe7c, 0x95209e8f, 0x672c9371, 0xb2d05f69, 0x40dc5297, + 0x53243264, 0xa1283f9a, 0xfd31dca5, 0x0f3dd15b, 0x1cc5b1a8, 0xeec9bc56, + 0x3b35704e, 0xc9397db0, 0xdac11d43, 0x28cd10bd, 0x62f2db3d, 0x90fed6c3, + 0x8306b630, 0x710abbce, 0xa4f677d6, 0x56fa7a28, 0x45021adb, 0xb70e1725, + 0xeb17f41a, 0x191bf9e4, 0x0ae39917, 0xf8ef94e9, 0x2d1358f1, 0xdf1f550f, + 0xcce735fc, 0x3eeb3802, 0xb13145f8, 0x433d4806, 0x50c528f5, 0xa2c9250b, + 0x7735e913, 0x8539e4ed, 0x96c1841e, 0x64cd89e0, 0x38d46adf, 0xcad86721, + 0xd92007d2, 0x2b2c0a2c, 0xfed0c634, 0x0cdccbca, 0x1f24ab39, 0xed28a6c7, + 0xa7176d47, 0x551b60b9, 0x46e3004a, 0xb4ef0db4, 0x6113c1ac, 0x931fcc52, + 0x80e7aca1, 0x72eba15f, 0x2ef24260, 0xdcfe4f9e, 0xcf062f6d, 0x3d0a2293, + 0xe8f6ee8b, 0x1afae375, 0x09028386, 0xfb0e8e78, 0x9d7d1486, 0x6f711978, + 0x7c89798b, 0x8e857475, 0x5b79b86d, 0xa975b593, 0xba8dd560, 0x4881d89e, + 0x14983ba1, 0xe694365f, 0xf56c56ac, 0x07605b52, 0xd29c974a, 0x20909ab4, + 0x3368fa47, 0xc164f7b9, 0x8b5b3c39, 0x795731c7, 0x6aaf5134, 0x98a35cca, + 0x4d5f90d2, 0xbf539d2c, 0xacabfddf, 0x5ea7f021, 0x02be131e, 0xf0b21ee0, + 0xe34a7e13, 
0x114673ed, 0xc4babff5, 0x36b6b20b, 0x254ed2f8, 0xd742df06, + 0xe9a9e704, 0x1ba5eafa, 0x085d8a09, 0xfa5187f7, 0x2fad4bef, 0xdda14611, + 0xce5926e2, 0x3c552b1c, 0x604cc823, 0x9240c5dd, 0x81b8a52e, 0x73b4a8d0, + 0xa64864c8, 0x54446936, 0x47bc09c5, 0xb5b0043b, 0xff8fcfbb, 0x0d83c245, + 0x1e7ba2b6, 0xec77af48, 0x398b6350, 0xcb876eae, 0xd87f0e5d, 0x2a7303a3, + 0x766ae09c, 0x8466ed62, 0x979e8d91, 0x6592806f, 0xb06e4c77, 0x42624189, + 0x519a217a, 0xa3962c84, 0xc5e5b67a, 0x37e9bb84, 0x2411db77, 0xd61dd689, + 0x03e11a91, 0xf1ed176f, 0xe215779c, 0x10197a62, 0x4c00995d, 0xbe0c94a3, + 0xadf4f450, 0x5ff8f9ae, 0x8a0435b6, 0x78083848, 0x6bf058bb, 0x99fc5545, + 0xd3c39ec5, 0x21cf933b, 0x3237f3c8, 0xc03bfe36, 0x15c7322e, 0xe7cb3fd0, + 0xf4335f23, 0x063f52dd, 0x5a26b1e2, 0xa82abc1c, 0xbbd2dcef, 0x49ded111, + 0x9c221d09, 0x6e2e10f7, 0x7dd67004, 0x8fda7dfa}; + +constexpr const ptrdiff_t kPrefetchHorizon = 256; + +} // namespace + +namespace crc32c { + +uint32_t ExtendPortable(uint32_t crc, const uint8_t* data, size_t size) { + const uint8_t* p = data; + const uint8_t* e = p + size; + uint32_t l = crc ^ kCRC32Xor; + +// Process one byte at a time. +#define STEP1 \ + do { \ + int c = (l & 0xff) ^ *p++; \ + l = kByteExtensionTable[c] ^ (l >> 8); \ + } while (0) + +// Process one of the 4 strides of 4-byte data. +#define STEP4(s) \ + do { \ + crc##s = ReadUint32LE(p + s * 4) ^ kStrideExtensionTable3[crc##s & 0xff] ^ \ + kStrideExtensionTable2[(crc##s >> 8) & 0xff] ^ \ + kStrideExtensionTable1[(crc##s >> 16) & 0xff] ^ \ + kStrideExtensionTable0[crc##s >> 24]; \ + } while (0) + +// Process a 16-byte swath of 4 strides, each of which has 4 bytes of data. +#define STEP16 \ + do { \ + STEP4(0); \ + STEP4(1); \ + STEP4(2); \ + STEP4(3); \ + p += 16; \ + } while (0) + +// Process 4 bytes that were already loaded into a word. 
+#define STEP4W(w) \ + do { \ + w ^= l; \ + for (size_t i = 0; i < 4; ++i) { \ + w = (w >> 8) ^ kByteExtensionTable[w & 0xff]; \ + } \ + l = w; \ + } while (0) + + // Point x at first 4-byte aligned byte in the buffer. This might be past the + // end of the buffer. + const uint8_t* x = RoundUp<4>(p); + if (x <= e) { + // Process bytes p is 4-byte aligned. + while (p != x) { + STEP1; + } + } + + if ((e - p) >= 16) { + // Load a 16-byte swath into the stride partial results. + uint32_t crc0 = ReadUint32LE(p + 0 * 4) ^ l; + uint32_t crc1 = ReadUint32LE(p + 1 * 4); + uint32_t crc2 = ReadUint32LE(p + 2 * 4); + uint32_t crc3 = ReadUint32LE(p + 3 * 4); + p += 16; + + while ((e - p) > kPrefetchHorizon) { + RequestPrefetch(p + kPrefetchHorizon); + + // Process 64 bytes at a time. + STEP16; + STEP16; + STEP16; + STEP16; + } + + // Process one 16-byte swath at a time. + while ((e - p) >= 16) { + STEP16; + } + + // Advance one word at a time as far as possible. + while ((e - p) >= 4) { + STEP4(0); + uint32_t tmp = crc0; + crc0 = crc1; + crc1 = crc2; + crc2 = crc3; + crc3 = tmp; + p += 4; + } + + // Combine the 4 partial stride results. + l = 0; + STEP4W(crc0); + STEP4W(crc1); + STEP4W(crc2); + STEP4W(crc3); + } + + // Process the last few bytes. + while (p != e) { + STEP1; + } +#undef STEP4W +#undef STEP16 +#undef STEP4 +#undef STEP1 + return l ^ kCRC32Xor; +} + +} // namespace crc32c diff --git a/third_party/crc32c/src/src/crc32c_portable_unittest.cc b/third_party/crc32c/src/src/crc32c_portable_unittest.cc new file mode 100644 index 0000000000..5098e2c373 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_portable_unittest.cc @@ -0,0 +1,20 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "gtest/gtest.h" + +#include "./crc32c_extend_unittests.h" +#include "./crc32c_internal.h" + +namespace crc32c { + +struct PortableTestTraits { + static uint32_t Extend(uint32_t crc, const uint8_t* data, size_t count) { + return ExtendPortable(crc, data, count); + } +}; + +INSTANTIATE_TYPED_TEST_SUITE_P(Portable, ExtendTest, PortableTestTraits); + +} // namespace crc32c diff --git a/third_party/crc32c/src/src/crc32c_prefetch.h b/third_party/crc32c/src/src/crc32c_prefetch.h new file mode 100644 index 0000000000..e8df540494 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_prefetch.h @@ -0,0 +1,44 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef CRC32C_CRC32C_PREFETCH_H_ +#define CRC32C_CRC32C_PREFETCH_H_ + +#include <cstddef> +#include <cstdint> + +#include "crc32c/crc32c_config.h" + +#if HAVE_MM_PREFETCH + +#if defined(_MSC_VER) +#include <intrin.h> +#else // !defined(_MSC_VER) +#include <xmmintrin.h> +#endif // defined(_MSC_VER) + +#endif // HAVE_MM_PREFETCH + +namespace crc32c { + +// Ask the hardware to prefetch the data at the given address into the L1 cache. +inline void RequestPrefetch(const uint8_t* address) { +#if HAVE_BUILTIN_PREFETCH + // Clang and GCC implement the __builtin_prefetch non-standard extension, + // which maps to the best instruction on the target architecture. + __builtin_prefetch(reinterpret_cast<const char*>(address), 0 /* Read only. */, + 0 /* No temporal locality. */); +#elif HAVE_MM_PREFETCH + // Visual Studio doesn't implement __builtin_prefetch, but exposes the + // PREFETCHNTA instruction via the _mm_prefetch intrinsic. + _mm_prefetch(reinterpret_cast<const char*>(address), _MM_HINT_NTA); +#else + // No prefetch support. Silence compiler warnings. 
+ (void)address; +#endif // HAVE_BUILTIN_PREFETCH +} + +} // namespace crc32c + +#endif // CRC32C_CRC32C_ROUND_UP_H_ diff --git a/third_party/crc32c/src/src/crc32c_prefetch_unittest.cc b/third_party/crc32c/src/src/crc32c_prefetch_unittest.cc new file mode 100644 index 0000000000..b34ed2d5fe --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_prefetch_unittest.cc @@ -0,0 +1,9 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "./crc32c_prefetch.h" + +// There is no easy way to test cache prefetching. We can only test that the +// crc32c_prefetch.h header compiles on its own, so it doesn't have any unstated +// dependencies. diff --git a/third_party/crc32c/src/src/crc32c_read_le.h b/third_party/crc32c/src/src/crc32c_read_le.h new file mode 100644 index 0000000000..fe455623c2 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_read_le.h @@ -0,0 +1,51 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef CRC32C_CRC32C_READ_LE_H_ +#define CRC32C_CRC32C_READ_LE_H_ + +#include <cstdint> +#include <cstring> + +#include "crc32c/crc32c_config.h" + +namespace crc32c { + +// Reads a little-endian 32-bit integer from a 32-bit-aligned buffer. +inline uint32_t ReadUint32LE(const uint8_t* buffer) { +#if BYTE_ORDER_BIG_ENDIAN + return ((static_cast<uint32_t>(static_cast<uint8_t>(buffer[0]))) | + (static_cast<uint32_t>(static_cast<uint8_t>(buffer[1])) << 8) | + (static_cast<uint32_t>(static_cast<uint8_t>(buffer[2])) << 16) | + (static_cast<uint32_t>(static_cast<uint8_t>(buffer[3])) << 24)); +#else // !BYTE_ORDER_BIG_ENDIAN + uint32_t result; + // This should be optimized to a single instruction. 
+ std::memcpy(&result, buffer, sizeof(result)); + return result; +#endif // BYTE_ORDER_BIG_ENDIAN +} + +// Reads a little-endian 64-bit integer from a 64-bit-aligned buffer. +inline uint64_t ReadUint64LE(const uint8_t* buffer) { +#if BYTE_ORDER_BIG_ENDIAN + return ((static_cast<uint32_t>(static_cast<uint8_t>(buffer[0]))) | + (static_cast<uint32_t>(static_cast<uint8_t>(buffer[1])) << 8) | + (static_cast<uint32_t>(static_cast<uint8_t>(buffer[2])) << 16) | + (static_cast<uint32_t>(static_cast<uint8_t>(buffer[3])) << 24) | + (static_cast<uint32_t>(static_cast<uint8_t>(buffer[4])) << 32) | + (static_cast<uint32_t>(static_cast<uint8_t>(buffer[5])) << 40) | + (static_cast<uint32_t>(static_cast<uint8_t>(buffer[6])) << 48) | + (static_cast<uint32_t>(static_cast<uint8_t>(buffer[7])) << 56)); +#else // !BYTE_ORDER_BIG_ENDIAN + uint64_t result; + // This should be optimized to a single instruction. + std::memcpy(&result, buffer, sizeof(result)); + return result; +#endif // BYTE_ORDER_BIG_ENDIAN +} + +} // namespace crc32c + +#endif // CRC32C_CRC32C_READ_LE_H_ diff --git a/third_party/crc32c/src/src/crc32c_read_le_unittest.cc b/third_party/crc32c/src/src/crc32c_read_le_unittest.cc new file mode 100644 index 0000000000..2a30302adf --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_read_le_unittest.cc @@ -0,0 +1,32 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "./crc32c_read_le.h" + +#include <cstddef> +#include <cstdint> + +#include "gtest/gtest.h" + +#include "./crc32c_round_up.h" + +namespace crc32c { + +TEST(Crc32CReadLETest, ReadUint32LE) { + // little-endian 0x12345678 + alignas(4) uint8_t bytes[] = {0x78, 0x56, 0x34, 0x12}; + + ASSERT_EQ(RoundUp<4>(bytes), bytes) << "Stack array is not aligned"; + EXPECT_EQ(static_cast<uint32_t>(0x12345678), ReadUint32LE(bytes)); +} + +TEST(Crc32CReadLETest, ReadUint64LE) { + // little-endian 0x123456789ABCDEF0 + alignas(8) uint8_t bytes[] = {0xF0, 0xDE, 0xBC, 0x9A, 0x78, 0x56, 0x34, 0x12}; + + ASSERT_EQ(RoundUp<8>(bytes), bytes) << "Stack array is not aligned"; + EXPECT_EQ(static_cast<uint64_t>(0x123456789ABCDEF0), ReadUint64LE(bytes)); +} + +} // namespace crc32c diff --git a/third_party/crc32c/src/src/crc32c_round_up.h b/third_party/crc32c/src/src/crc32c_round_up.h new file mode 100644 index 0000000000..d3b922beb9 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_round_up.h @@ -0,0 +1,34 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef CRC32C_CRC32C_ROUND_UP_H_ +#define CRC32C_CRC32C_ROUND_UP_H_ + +#include <cstddef> +#include <cstdint> + +namespace crc32c { + +// Returns the smallest number >= the given number that is evenly divided by N. +// +// N must be a power of two. +template <int N> +constexpr inline uintptr_t RoundUp(uintptr_t pointer) { + static_assert((N & (N - 1)) == 0, "N must be a power of two"); + return (pointer + (N - 1)) & ~(N - 1); +} + +// Returns the smallest address >= the given address that is aligned to N bytes. +// +// N must be a power of two. 
+template <int N> +constexpr inline const uint8_t* RoundUp(const uint8_t* pointer) { + static_assert((N & (N - 1)) == 0, "N must be a power of two"); + return reinterpret_cast<uint8_t*>( + RoundUp<N>(reinterpret_cast<uintptr_t>(pointer))); +} + +} // namespace crc32c + +#endif // CRC32C_CRC32C_ROUND_UP_H_ diff --git a/third_party/crc32c/src/src/crc32c_round_up_unittest.cc b/third_party/crc32c/src/src/crc32c_round_up_unittest.cc new file mode 100644 index 0000000000..5ff657bb5c --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_round_up_unittest.cc @@ -0,0 +1,84 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "./crc32c_round_up.h" + +#include <cstddef> +#include <cstdint> + +#include "gtest/gtest.h" + +namespace crc32c { + +TEST(CRC32CRoundUpTest, RoundUpUintptr) { + uintptr_t zero = 0; + + ASSERT_EQ(zero, RoundUp<1>(zero)); + ASSERT_EQ(1U, RoundUp<1>(1U)); + ASSERT_EQ(2U, RoundUp<1>(2U)); + ASSERT_EQ(3U, RoundUp<1>(3U)); + ASSERT_EQ(~static_cast<uintptr_t>(0), RoundUp<1>(~static_cast<uintptr_t>(0))); + ASSERT_EQ(~static_cast<uintptr_t>(1), RoundUp<1>(~static_cast<uintptr_t>(1))); + ASSERT_EQ(~static_cast<uintptr_t>(2), RoundUp<1>(~static_cast<uintptr_t>(2))); + ASSERT_EQ(~static_cast<uintptr_t>(3), RoundUp<1>(~static_cast<uintptr_t>(3))); + + ASSERT_EQ(zero, RoundUp<2>(zero)); + ASSERT_EQ(2U, RoundUp<2>(1U)); + ASSERT_EQ(2U, RoundUp<2>(2U)); + ASSERT_EQ(4U, RoundUp<2>(3U)); + ASSERT_EQ(4U, RoundUp<2>(4U)); + ASSERT_EQ(6U, RoundUp<2>(5U)); + ASSERT_EQ(6U, RoundUp<2>(6U)); + ASSERT_EQ(8U, RoundUp<2>(7U)); + ASSERT_EQ(8U, RoundUp<2>(8U)); + ASSERT_EQ(~static_cast<uintptr_t>(1), RoundUp<2>(~static_cast<uintptr_t>(1))); + ASSERT_EQ(~static_cast<uintptr_t>(1), RoundUp<2>(~static_cast<uintptr_t>(2))); + ASSERT_EQ(~static_cast<uintptr_t>(3), RoundUp<2>(~static_cast<uintptr_t>(3))); + 
ASSERT_EQ(~static_cast<uintptr_t>(3), RoundUp<2>(~static_cast<uintptr_t>(4))); + + ASSERT_EQ(zero, RoundUp<4>(zero)); + ASSERT_EQ(4U, RoundUp<4>(1U)); + ASSERT_EQ(4U, RoundUp<4>(2U)); + ASSERT_EQ(4U, RoundUp<4>(3U)); + ASSERT_EQ(4U, RoundUp<4>(4U)); + ASSERT_EQ(8U, RoundUp<4>(5U)); + ASSERT_EQ(8U, RoundUp<4>(6U)); + ASSERT_EQ(8U, RoundUp<4>(7U)); + ASSERT_EQ(8U, RoundUp<4>(8U)); + ASSERT_EQ(~static_cast<uintptr_t>(3), RoundUp<4>(~static_cast<uintptr_t>(3))); + ASSERT_EQ(~static_cast<uintptr_t>(3), RoundUp<4>(~static_cast<uintptr_t>(4))); + ASSERT_EQ(~static_cast<uintptr_t>(3), RoundUp<4>(~static_cast<uintptr_t>(5))); + ASSERT_EQ(~static_cast<uintptr_t>(3), RoundUp<4>(~static_cast<uintptr_t>(6))); + ASSERT_EQ(~static_cast<uintptr_t>(7), RoundUp<4>(~static_cast<uintptr_t>(7))); + ASSERT_EQ(~static_cast<uintptr_t>(7), RoundUp<4>(~static_cast<uintptr_t>(8))); + ASSERT_EQ(~static_cast<uintptr_t>(7), RoundUp<4>(~static_cast<uintptr_t>(9))); +} + +TEST(CRC32CRoundUpTest, RoundUpPointer) { + uintptr_t zero = 0, three = 3, four = 4, seven = 7, eight = 8; + + const uint8_t* zero_ptr = reinterpret_cast<const uint8_t*>(zero); + const uint8_t* three_ptr = reinterpret_cast<const uint8_t*>(three); + const uint8_t* four_ptr = reinterpret_cast<const uint8_t*>(four); + const uint8_t* seven_ptr = reinterpret_cast<const uint8_t*>(seven); + const uint8_t* eight_ptr = reinterpret_cast<uint8_t*>(eight); + + ASSERT_EQ(zero_ptr, RoundUp<1>(zero_ptr)); + ASSERT_EQ(zero_ptr, RoundUp<4>(zero_ptr)); + ASSERT_EQ(zero_ptr, RoundUp<8>(zero_ptr)); + + ASSERT_EQ(three_ptr, RoundUp<1>(three_ptr)); + ASSERT_EQ(four_ptr, RoundUp<4>(three_ptr)); + ASSERT_EQ(eight_ptr, RoundUp<8>(three_ptr)); + + ASSERT_EQ(four_ptr, RoundUp<1>(four_ptr)); + ASSERT_EQ(four_ptr, RoundUp<4>(four_ptr)); + ASSERT_EQ(eight_ptr, RoundUp<8>(four_ptr)); + + ASSERT_EQ(seven_ptr, RoundUp<1>(seven_ptr)); + ASSERT_EQ(eight_ptr, RoundUp<4>(seven_ptr)); + ASSERT_EQ(eight_ptr, RoundUp<8>(four_ptr)); +} + +} // namespace crc32c diff 
--git a/third_party/crc32c/src/src/crc32c_sse42.cc b/third_party/crc32c/src/src/crc32c_sse42.cc new file mode 100644 index 0000000000..fc0cb0725f --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_sse42.cc @@ -0,0 +1,256 @@ +// Copyright 2008 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "./crc32c_sse42.h" + +// In a separate source file to allow this accelerated CRC32C function to be +// compiled with the appropriate compiler flags to enable SSE4.2 instructions. + +// This implementation is loosely based on Intel Pub 323405 from April 2011, +// "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction". + +#include <cstddef> +#include <cstdint> + +#include "./crc32c_internal.h" +#include "./crc32c_prefetch.h" +#include "./crc32c_read_le.h" +#include "./crc32c_round_up.h" +#include "crc32c/crc32c_config.h" + +#if HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) + +#if defined(_MSC_VER) +#include <intrin.h> +#else // !defined(_MSC_VER) +#include <nmmintrin.h> +#endif // defined(_MSC_VER) + +namespace crc32c { + +namespace { + +constexpr const ptrdiff_t kGroups = 3; +constexpr const ptrdiff_t kBlock0Size = 16 * 1024 / kGroups / 64 * 64; +constexpr const ptrdiff_t kBlock1Size = 4 * 1024 / kGroups / 8 * 8; +constexpr const ptrdiff_t kBlock2Size = 1024 / kGroups / 8 * 8; + +const uint32_t kBlock0SkipTable[8][16] = { + {0x00000000, 0xff770459, 0xfb027e43, 0x04757a1a, 0xf3e88a77, 0x0c9f8e2e, + 0x08eaf434, 0xf79df06d, 0xe23d621f, 0x1d4a6646, 0x193f1c5c, 0xe6481805, + 0x11d5e868, 0xeea2ec31, 0xead7962b, 0x15a09272}, + {0x00000000, 0xc196b2cf, 0x86c1136f, 0x4757a1a0, 0x086e502f, 0xc9f8e2e0, + 0x8eaf4340, 0x4f39f18f, 0x10dca05e, 0xd14a1291, 0x961db331, 0x578b01fe, + 0x18b2f071, 0xd92442be, 0x9e73e31e, 0x5fe551d1}, + {0x00000000, 0x21b940bc, 0x43728178, 0x62cbc1c4, 0x86e502f0, 0xa75c424c, + 
0xc5978388, 0xe42ec334, 0x08267311, 0x299f33ad, 0x4b54f269, 0x6aedb2d5, + 0x8ec371e1, 0xaf7a315d, 0xcdb1f099, 0xec08b025}, + {0x00000000, 0x104ce622, 0x2099cc44, 0x30d52a66, 0x41339888, 0x517f7eaa, + 0x61aa54cc, 0x71e6b2ee, 0x82673110, 0x922bd732, 0xa2fefd54, 0xb2b21b76, + 0xc354a998, 0xd3184fba, 0xe3cd65dc, 0xf38183fe}, + {0x00000000, 0x012214d1, 0x024429a2, 0x03663d73, 0x04885344, 0x05aa4795, + 0x06cc7ae6, 0x07ee6e37, 0x0910a688, 0x0832b259, 0x0b548f2a, 0x0a769bfb, + 0x0d98f5cc, 0x0cbae11d, 0x0fdcdc6e, 0x0efec8bf}, + {0x00000000, 0x12214d10, 0x24429a20, 0x3663d730, 0x48853440, 0x5aa47950, + 0x6cc7ae60, 0x7ee6e370, 0x910a6880, 0x832b2590, 0xb548f2a0, 0xa769bfb0, + 0xd98f5cc0, 0xcbae11d0, 0xfdcdc6e0, 0xefec8bf0}, + {0x00000000, 0x27f8a7f1, 0x4ff14fe2, 0x6809e813, 0x9fe29fc4, 0xb81a3835, + 0xd013d026, 0xf7eb77d7, 0x3a294979, 0x1dd1ee88, 0x75d8069b, 0x5220a16a, + 0xa5cbd6bd, 0x8233714c, 0xea3a995f, 0xcdc23eae}, + {0x00000000, 0x745292f2, 0xe8a525e4, 0x9cf7b716, 0xd4a63d39, 0xa0f4afcb, + 0x3c0318dd, 0x48518a2f, 0xaca00c83, 0xd8f29e71, 0x44052967, 0x3057bb95, + 0x780631ba, 0x0c54a348, 0x90a3145e, 0xe4f186ac}, +}; +const uint32_t kBlock1SkipTable[8][16] = { + {0x00000000, 0x79113270, 0xf22264e0, 0x8b335690, 0xe1a8bf31, 0x98b98d41, + 0x138adbd1, 0x6a9be9a1, 0xc6bd0893, 0xbfac3ae3, 0x349f6c73, 0x4d8e5e03, + 0x2715b7a2, 0x5e0485d2, 0xd537d342, 0xac26e132}, + {0x00000000, 0x889667d7, 0x14c0b95f, 0x9c56de88, 0x298172be, 0xa1171569, + 0x3d41cbe1, 0xb5d7ac36, 0x5302e57c, 0xdb9482ab, 0x47c25c23, 0xcf543bf4, + 0x7a8397c2, 0xf215f015, 0x6e432e9d, 0xe6d5494a}, + {0x00000000, 0xa605caf8, 0x49e7e301, 0xefe229f9, 0x93cfc602, 0x35ca0cfa, + 0xda282503, 0x7c2deffb, 0x2273faf5, 0x8476300d, 0x6b9419f4, 0xcd91d30c, + 0xb1bc3cf7, 0x17b9f60f, 0xf85bdff6, 0x5e5e150e}, + {0x00000000, 0x44e7f5ea, 0x89cfebd4, 0xcd281e3e, 0x1673a159, 0x529454b3, + 0x9fbc4a8d, 0xdb5bbf67, 0x2ce742b2, 0x6800b758, 0xa528a966, 0xe1cf5c8c, + 0x3a94e3eb, 0x7e731601, 0xb35b083f, 0xf7bcfdd5}, + {0x00000000, 0x59ce8564, 
0xb39d0ac8, 0xea538fac, 0x62d66361, 0x3b18e605, + 0xd14b69a9, 0x8885eccd, 0xc5acc6c2, 0x9c6243a6, 0x7631cc0a, 0x2fff496e, + 0xa77aa5a3, 0xfeb420c7, 0x14e7af6b, 0x4d292a0f}, + {0x00000000, 0x8eb5fb75, 0x1887801b, 0x96327b6e, 0x310f0036, 0xbfbafb43, + 0x2988802d, 0xa73d7b58, 0x621e006c, 0xecabfb19, 0x7a998077, 0xf42c7b02, + 0x5311005a, 0xdda4fb2f, 0x4b968041, 0xc5237b34}, + {0x00000000, 0xc43c00d8, 0x8d947741, 0x49a87799, 0x1ec49873, 0xdaf898ab, + 0x9350ef32, 0x576cefea, 0x3d8930e6, 0xf9b5303e, 0xb01d47a7, 0x7421477f, + 0x234da895, 0xe771a84d, 0xaed9dfd4, 0x6ae5df0c}, + {0x00000000, 0x7b1261cc, 0xf624c398, 0x8d36a254, 0xe9a5f1c1, 0x92b7900d, + 0x1f813259, 0x64935395, 0xd6a79573, 0xadb5f4bf, 0x208356eb, 0x5b913727, + 0x3f0264b2, 0x4410057e, 0xc926a72a, 0xb234c6e6}, +}; +const uint32_t kBlock2SkipTable[8][16] = { + {0x00000000, 0x8f158014, 0x1bc776d9, 0x94d2f6cd, 0x378eedb2, 0xb89b6da6, + 0x2c499b6b, 0xa35c1b7f, 0x6f1ddb64, 0xe0085b70, 0x74daadbd, 0xfbcf2da9, + 0x589336d6, 0xd786b6c2, 0x4354400f, 0xcc41c01b}, + {0x00000000, 0xde3bb6c8, 0xb99b1b61, 0x67a0ada9, 0x76da4033, 0xa8e1f6fb, + 0xcf415b52, 0x117aed9a, 0xedb48066, 0x338f36ae, 0x542f9b07, 0x8a142dcf, + 0x9b6ec055, 0x4555769d, 0x22f5db34, 0xfcce6dfc}, + {0x00000000, 0xde85763d, 0xb8e69a8b, 0x6663ecb6, 0x742143e7, 0xaaa435da, + 0xccc7d96c, 0x1242af51, 0xe84287ce, 0x36c7f1f3, 0x50a41d45, 0x8e216b78, + 0x9c63c429, 0x42e6b214, 0x24855ea2, 0xfa00289f}, + {0x00000000, 0xd569796d, 0xaf3e842b, 0x7a57fd46, 0x5b917ea7, 0x8ef807ca, + 0xf4affa8c, 0x21c683e1, 0xb722fd4e, 0x624b8423, 0x181c7965, 0xcd750008, + 0xecb383e9, 0x39dafa84, 0x438d07c2, 0x96e47eaf}, + {0x00000000, 0x6ba98c6d, 0xd75318da, 0xbcfa94b7, 0xab4a4745, 0xc0e3cb28, + 0x7c195f9f, 0x17b0d3f2, 0x5378f87b, 0x38d17416, 0x842be0a1, 0xef826ccc, + 0xf832bf3e, 0x939b3353, 0x2f61a7e4, 0x44c82b89}, + {0x00000000, 0xa6f1f0f6, 0x480f971d, 0xeefe67eb, 0x901f2e3a, 0x36eedecc, + 0xd810b927, 0x7ee149d1, 0x25d22a85, 0x8323da73, 0x6dddbd98, 0xcb2c4d6e, + 0xb5cd04bf, 0x133cf449, 
0xfdc293a2, 0x5b336354}, + {0x00000000, 0x4ba4550a, 0x9748aa14, 0xdcecff1e, 0x2b7d22d9, 0x60d977d3, + 0xbc3588cd, 0xf791ddc7, 0x56fa45b2, 0x1d5e10b8, 0xc1b2efa6, 0x8a16baac, + 0x7d87676b, 0x36233261, 0xeacfcd7f, 0xa16b9875}, + {0x00000000, 0xadf48b64, 0x5e056039, 0xf3f1eb5d, 0xbc0ac072, 0x11fe4b16, + 0xe20fa04b, 0x4ffb2b2f, 0x7df9f615, 0xd00d7d71, 0x23fc962c, 0x8e081d48, + 0xc1f33667, 0x6c07bd03, 0x9ff6565e, 0x3202dd3a}, +}; + +constexpr const ptrdiff_t kPrefetchHorizon = 256; + +} // namespace + +uint32_t ExtendSse42(uint32_t crc, const uint8_t* data, size_t size) { + const uint8_t* p = data; + const uint8_t* e = data + size; + uint32_t l = crc ^ kCRC32Xor; + +#define STEP1 \ + do { \ + l = _mm_crc32_u8(l, *p++); \ + } while (0) + +#define STEP4(crc) \ + do { \ + crc = _mm_crc32_u32(crc, ReadUint32LE(p)); \ + p += 4; \ + } while (0) + +#define STEP8(crc, data) \ + do { \ + crc = _mm_crc32_u64(crc, ReadUint64LE(data)); \ + data += 8; \ + } while (0) + +#define STEP8BY3(crc0, crc1, crc2, p0, p1, p2) \ + do { \ + STEP8(crc0, p0); \ + STEP8(crc1, p1); \ + STEP8(crc2, p2); \ + } while (0) + +#define STEP8X3(crc0, crc1, crc2, bs) \ + do { \ + crc0 = _mm_crc32_u64(crc0, ReadUint64LE(p)); \ + crc1 = _mm_crc32_u64(crc1, ReadUint64LE(p + bs)); \ + crc2 = _mm_crc32_u64(crc2, ReadUint64LE(p + 2 * bs)); \ + p += 8; \ + } while (0) + +#define SKIP_BLOCK(crc, tab) \ + do { \ + crc = tab[0][crc & 0xf] ^ tab[1][(crc >> 4) & 0xf] ^ \ + tab[2][(crc >> 8) & 0xf] ^ tab[3][(crc >> 12) & 0xf] ^ \ + tab[4][(crc >> 16) & 0xf] ^ tab[5][(crc >> 20) & 0xf] ^ \ + tab[6][(crc >> 24) & 0xf] ^ tab[7][(crc >> 28) & 0xf]; \ + } while (0) + + // Point x at first 8-byte aligned byte in the buffer. This might be past the + // end of the buffer. + const uint8_t* x = RoundUp<8>(p); + if (x <= e) { + // Process bytes p is 8-byte aligned. + while (p != x) { + STEP1; + } + } + + // Proccess the data in predetermined block sizes with tables for quickly + // combining the checksum. 
Experimentally it's better to use larger block + // sizes where possible so use a hierarchy of decreasing block sizes. + uint64_t l64 = l; + while ((e - p) >= kGroups * kBlock0Size) { + uint64_t l641 = 0; + uint64_t l642 = 0; + for (int i = 0; i < kBlock0Size; i += 8 * 8) { + // Prefetch ahead to hide latency. + RequestPrefetch(p + kPrefetchHorizon); + RequestPrefetch(p + kBlock0Size + kPrefetchHorizon); + RequestPrefetch(p + 2 * kBlock0Size + kPrefetchHorizon); + + // Process 64 bytes at a time. + STEP8X3(l64, l641, l642, kBlock0Size); + STEP8X3(l64, l641, l642, kBlock0Size); + STEP8X3(l64, l641, l642, kBlock0Size); + STEP8X3(l64, l641, l642, kBlock0Size); + STEP8X3(l64, l641, l642, kBlock0Size); + STEP8X3(l64, l641, l642, kBlock0Size); + STEP8X3(l64, l641, l642, kBlock0Size); + STEP8X3(l64, l641, l642, kBlock0Size); + } + + // Combine results. + SKIP_BLOCK(l64, kBlock0SkipTable); + l64 ^= l641; + SKIP_BLOCK(l64, kBlock0SkipTable); + l64 ^= l642; + p += (kGroups - 1) * kBlock0Size; + } + while ((e - p) >= kGroups * kBlock1Size) { + uint64_t l641 = 0; + uint64_t l642 = 0; + for (int i = 0; i < kBlock1Size; i += 8) { + STEP8X3(l64, l641, l642, kBlock1Size); + } + SKIP_BLOCK(l64, kBlock1SkipTable); + l64 ^= l641; + SKIP_BLOCK(l64, kBlock1SkipTable); + l64 ^= l642; + p += (kGroups - 1) * kBlock1Size; + } + while ((e - p) >= kGroups * kBlock2Size) { + uint64_t l641 = 0; + uint64_t l642 = 0; + for (int i = 0; i < kBlock2Size; i += 8) { + STEP8X3(l64, l641, l642, kBlock2Size); + } + SKIP_BLOCK(l64, kBlock2SkipTable); + l64 ^= l641; + SKIP_BLOCK(l64, kBlock2SkipTable); + l64 ^= l642; + p += (kGroups - 1) * kBlock2Size; + } + + // Process bytes 16 at a time + while ((e - p) >= 16) { + STEP8(l64, p); + STEP8(l64, p); + } + + l = static_cast<uint32_t>(l64); + // Process the last few bytes. 
+ while (p != e) { + STEP1; + } +#undef SKIP_BLOCK +#undef STEP8X3 +#undef STEP8BY3 +#undef STEP8 +#undef STEP4 +#undef STEP1 + + return l ^ kCRC32Xor; +} + +} // namespace crc32c + +#endif // HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) diff --git a/third_party/crc32c/src/src/crc32c_sse42.h b/third_party/crc32c/src/src/crc32c_sse42.h new file mode 100644 index 0000000000..b9ed179e54 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_sse42.h @@ -0,0 +1,31 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef CRC32C_CRC32C_SSE42_H_ +#define CRC32C_CRC32C_SSE42_H_ + +// X86-specific code. + +#include <cstddef> +#include <cstdint> + +#include "crc32c/crc32c_config.h" + +// The hardware-accelerated implementation is only enabled for 64-bit builds, +// because a straightforward 32-bit implementation actually runs slower than the +// portable version. Most X86 machines are 64-bit nowadays, so it doesn't make +// much sense to spend time building an optimized hardware-accelerated +// implementation. +#if HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) + +namespace crc32c { + +// SSE4.2-accelerated implementation in crc32c_sse42.cc +uint32_t ExtendSse42(uint32_t crc, const uint8_t* data, size_t count); + +} // namespace crc32c + +#endif // HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) + +#endif // CRC32C_CRC32C_SSE42_H_ diff --git a/third_party/crc32c/src/src/crc32c_sse42_check.h b/third_party/crc32c/src/src/crc32c_sse42_check.h new file mode 100644 index 0000000000..ad380dd20e --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_sse42_check.h @@ -0,0 +1,48 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#ifndef CRC32C_CRC32C_SSE42_CHECK_H_ +#define CRC32C_CRC32C_SSE42_CHECK_H_ + +// X86-specific code checking the availability of SSE4.2 instructions. + +#include <cstddef> +#include <cstdint> + +#include "crc32c/crc32c_config.h" + +#if HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) + +// If the compiler supports SSE4.2, it definitely supports X86. + +#if defined(_MSC_VER) +#include <intrin.h> + +namespace crc32c { + +inline bool CanUseSse42() { + int cpu_info[4]; + __cpuid(cpu_info, 1); + return (cpu_info[2] & (1 << 20)) != 0; +} + +} // namespace crc32c + +#else // !defined(_MSC_VER) +#include <cpuid.h> + +namespace crc32c { + +inline bool CanUseSse42() { + unsigned int eax, ebx, ecx, edx; + return __get_cpuid(1, &eax, &ebx, &ecx, &edx) && ((ecx & (1 << 20)) != 0); +} + +} // namespace crc32c + +#endif // defined(_MSC_VER) + +#endif // HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) + +#endif // CRC32C_CRC32C_SSE42_CHECK_H_ diff --git a/third_party/crc32c/src/src/crc32c_sse42_unittest.cc b/third_party/crc32c/src/src/crc32c_sse42_unittest.cc new file mode 100644 index 0000000000..c73ad8ddd1 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_sse42_unittest.cc @@ -0,0 +1,24 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "gtest/gtest.h" + +#include "./crc32c_extend_unittests.h" +#include "./crc32c_sse42.h" + +namespace crc32c { + +#if HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) + +struct Sse42TestTraits { + static uint32_t Extend(uint32_t crc, const uint8_t* data, size_t count) { + return ExtendSse42(crc, data, count); + } +}; + +INSTANTIATE_TYPED_TEST_SUITE_P(Sse42, ExtendTest, Sse42TestTraits); + +#endif // HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__)) + +} // namespace crc32c diff --git a/third_party/crc32c/src/src/crc32c_test_main.cc b/third_party/crc32c/src/src/crc32c_test_main.cc new file mode 100644 index 0000000000..c07e1c8bc4 --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_test_main.cc @@ -0,0 +1,20 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "crc32c/crc32c_config.h" + +#include "gtest/gtest.h" + +#if CRC32C_TESTS_BUILT_WITH_GLOG +#include "glog/logging.h" +#endif // CRC32C_TESTS_BUILT_WITH_GLOG + +int main(int argc, char** argv) { +#if CRC32C_TESTS_BUILT_WITH_GLOG + google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); +#endif // CRC32C_TESTS_BUILT_WITH_GLOG + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/third_party/crc32c/src/src/crc32c_unittest.cc b/third_party/crc32c/src/src/crc32c_unittest.cc new file mode 100644 index 0000000000..d6c6af680c --- /dev/null +++ b/third_party/crc32c/src/src/crc32c_unittest.cc @@ -0,0 +1,129 @@ +// Copyright 2017 The CRC32C Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "crc32c/crc32c.h" + +#include <cstddef> +#include <cstdint> +#include <cstring> + +#include "gtest/gtest.h" + +#include "./crc32c_extend_unittests.h" + +TEST(Crc32CTest, Crc32c) { + // From rfc3720 section B.4. + uint8_t buf[32]; + + std::memset(buf, 0, sizeof(buf)); + EXPECT_EQ(static_cast<uint32_t>(0x8a9136aa), + crc32c::Crc32c(buf, sizeof(buf))); + + std::memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(static_cast<uint32_t>(0x62a8ab43), + crc32c::Crc32c(buf, sizeof(buf))); + + for (size_t i = 0; i < 32; ++i) + buf[i] = static_cast<uint8_t>(i); + EXPECT_EQ(static_cast<uint32_t>(0x46dd794e), + crc32c::Crc32c(buf, sizeof(buf))); + + for (size_t i = 0; i < 32; ++i) + buf[i] = static_cast<uint8_t>(31 - i); + EXPECT_EQ(static_cast<uint32_t>(0x113fdb5c), + crc32c::Crc32c(buf, sizeof(buf))); + + uint8_t data[48] = { + 0x01, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x18, 0x28, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + EXPECT_EQ(static_cast<uint32_t>(0xd9963a56), + crc32c::Crc32c(data, sizeof(data))); +} + +namespace crc32c { + +struct ApiTestTraits { + static uint32_t Extend(uint32_t crc, const uint8_t* data, size_t count) { + return ::crc32c::Extend(crc, data, count); + } +}; + +INSTANTIATE_TYPED_TEST_SUITE_P(Api, ExtendTest, ApiTestTraits); + +} // namespace crc32c + +TEST(CRC32CTest, Crc32cCharPointer) { + char buf[32]; + + std::memset(buf, 0, sizeof(buf)); + EXPECT_EQ(static_cast<uint32_t>(0x8a9136aa), + crc32c::Crc32c(buf, sizeof(buf))); + + std::memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(static_cast<uint32_t>(0x62a8ab43), + crc32c::Crc32c(buf, sizeof(buf))); + + for (size_t i = 0; i < 32; ++i) + buf[i] = static_cast<char>(i); + EXPECT_EQ(static_cast<uint32_t>(0x46dd794e), + crc32c::Crc32c(buf, sizeof(buf))); + + for (size_t i = 0; i < 32; ++i) + buf[i] = 
static_cast<char>(31 - i); + EXPECT_EQ(static_cast<uint32_t>(0x113fdb5c), + crc32c::Crc32c(buf, sizeof(buf))); +} + +TEST(CRC32CTest, Crc32cStdString) { + std::string buf; + buf.resize(32); + + for (size_t i = 0; i < 32; ++i) + buf[i] = static_cast<char>(0x00); + EXPECT_EQ(static_cast<uint32_t>(0x8a9136aa), crc32c::Crc32c(buf)); + + for (size_t i = 0; i < 32; ++i) + buf[i] = '\xff'; + EXPECT_EQ(static_cast<uint32_t>(0x62a8ab43), crc32c::Crc32c(buf)); + + for (size_t i = 0; i < 32; ++i) + buf[i] = static_cast<char>(i); + EXPECT_EQ(static_cast<uint32_t>(0x46dd794e), crc32c::Crc32c(buf)); + + for (size_t i = 0; i < 32; ++i) + buf[i] = static_cast<char>(31 - i); + EXPECT_EQ(static_cast<uint32_t>(0x113fdb5c), crc32c::Crc32c(buf)); +} + +#if __cplusplus > 201402L +#if __has_include(<string_view>) + +TEST(CRC32CTest, Crc32cStdStringView) { + uint8_t buf[32]; + std::string_view view(reinterpret_cast<const char*>(buf), sizeof(buf)); + + std::memset(buf, 0, sizeof(buf)); + EXPECT_EQ(static_cast<uint32_t>(0x8a9136aa), crc32c::Crc32c(view)); + + std::memset(buf, 0xff, sizeof(buf)); + EXPECT_EQ(static_cast<uint32_t>(0x62a8ab43), crc32c::Crc32c(view)); + + for (size_t i = 0; i < 32; ++i) + buf[i] = static_cast<uint8_t>(i); + EXPECT_EQ(static_cast<uint32_t>(0x46dd794e), crc32c::Crc32c(view)); + + for (size_t i = 0; i < 32; ++i) + buf[i] = static_cast<uint8_t>(31 - i); + EXPECT_EQ(static_cast<uint32_t>(0x113fdb5c), crc32c::Crc32c(view)); +} + +#endif // __has_include(<string_view>) +#endif // __cplusplus > 201402L + +#define TESTED_EXTEND Extend +#include "./crc32c_extend_unittests.h" +#undef TESTED_EXTEND diff --git a/third_party/libaom/CMakeLists.txt b/third_party/libaom/CMakeLists.txt index 2e84ff8350..55ac2e0bc7 100644 --- a/third_party/libaom/CMakeLists.txt +++ b/third_party/libaom/CMakeLists.txt @@ -91,22 +91,27 @@ elseif(LINUX_AARCH64) LICENSE "BSD-3-Clause" SRC + ${AOM_ROOT}/libaom/aom_ports/arm_cpudetect.c ${aom_av1_common_intrin_neon} ${aom_av1_common_sources} 
${aom_av1_decoder_sources} + ${aom_av1_encoder_intrin_neon} + ${aom_av1_encoder_sources} ${aom_dsp_common_intrin_neon} ${aom_dsp_common_sources} ${aom_dsp_decoder_sources} + ${aom_dsp_encoder_intrin_neon} + ${aom_dsp_encoder_sources} ${aom_mem_sources} - ${AOM_ROOT}/libaom/aom_ports/arm_cpudetect.c ${aom_rtcd_sources} ${aom_scale_sources} ${aom_sources} ${aom_util_sources}) target_include_directories( webrtc_libaom - PRIVATE ${AOM_ROOT}/config ${AOM_ROOT}/config/arm64 + PRIVATE ${AOM_ROOT}/config ${AOM_ROOT}/config/linux/arm64 PUBLIC ${AOM_ROOT}/libaom) +# target_compile_options(webrtc_libaom PRIVATE "-mfpu=neon") else() message(FATAL_ERROR "This can only be used in linux builds") endif() diff --git a/third_party/libaom/libaom_src.cmake b/third_party/libaom/libaom_src.cmake index 0f60cfc0ea..b582a517af 100644 --- a/third_party/libaom/libaom_src.cmake +++ b/third_party/libaom/libaom_src.cmake @@ -288,6 +288,8 @@ set(aom_av1_encoder_sources "${AOM_ROOT}/libaom/av1/encoder/extend.c" + "${AOM_ROOT}/libaom/av1/encoder/external_partition.c" + "${AOM_ROOT}/libaom/av1/encoder/hash.c" @@ -609,6 +611,8 @@ set(aom_dsp_encoder_sources "${AOM_ROOT}/libaom/aom_dsp/sad.c" "${AOM_ROOT}/libaom/aom_dsp/sad_av1.c" "${AOM_ROOT}/libaom/aom_dsp/sse.c" + "${AOM_ROOT}/libaom/aom_dsp/ssim.c" + "${AOM_ROOT}/libaom/aom_dsp/sum_squares.c" "${AOM_ROOT}/libaom/aom_dsp/variance.c" @@ -660,6 +664,7 @@ set(aom_sources + "${AOM_ROOT}/libaom/aom/src/aom_codec.c" "${AOM_ROOT}/libaom/aom/src/aom_decoder.c" "${AOM_ROOT}/libaom/aom/src/aom_encoder.c" diff --git a/third_party/libaom/source/config/config/aom_version.h b/third_party/libaom/source/config/config/aom_version.h index 9c9e03e384..d62ceb34f7 100644 --- a/third_party/libaom/source/config/config/aom_version.h +++ b/third_party/libaom/source/config/config/aom_version.h @@ -9,11 +9,11 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#define VERSION_MAJOR 2 -#define VERSION_MINOR 0 -#define VERSION_PATCH 2 -#define VERSION_EXTRA "1395-g79b775799" +#define VERSION_MAJOR 3 +#define VERSION_MINOR 1 +#define VERSION_PATCH 0 +#define VERSION_EXTRA "309-g12287adee" #define VERSION_PACKED \ ((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_PATCH)) -#define VERSION_STRING_NOSP "2.0.2-1395-g79b775799" -#define VERSION_STRING " 2.0.2-1395-g79b775799" +#define VERSION_STRING_NOSP "3.1.0-309-g12287adee" +#define VERSION_STRING " 3.1.0-309-g12287adee" diff --git a/third_party/libaom/source/config/ios/arm-neon/config/aom_config.asm b/third_party/libaom/source/config/ios/arm-neon/config/aom_config.asm index dcceb2497b..15c20d956a 100644 --- a/third_party/libaom/source/config/ios/arm-neon/config/aom_config.asm +++ b/third_party/libaom/source/config/ios/arm-neon/config/aom_config.asm @@ -20,6 +20,7 @@ CONFIG_AV1_ENCODER equ 1 CONFIG_AV1_HIGHBITDEPTH equ 0 CONFIG_AV1_TEMPORAL_DENOISING equ 1 CONFIG_BIG_ENDIAN equ 0 +CONFIG_BITRATE_ACCURACY equ 0 CONFIG_BITSTREAM_DEBUG equ 0 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 CONFIG_COLLECT_COMPONENT_TIMING equ 0 @@ -31,6 +32,7 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1 CONFIG_DIST_8X8 equ 0 CONFIG_ENTROPY_STATS equ 0 CONFIG_EXCLUDE_SIMD_MISMATCH equ 0 +CONFIG_FRAME_PARALLEL_ENCODE equ 0 CONFIG_GCC equ 1 CONFIG_GCOV equ 0 CONFIG_GPROF equ 0 @@ -46,6 +48,7 @@ CONFIG_NN_V2 equ 0 CONFIG_NORMAL_TILE_MODE equ 1 CONFIG_OPTICAL_FLOW_API equ 0 CONFIG_OS_SUPPORT equ 1 +CONFIG_PARTITION_SEARCH_ORDER equ 0 CONFIG_PIC equ 0 CONFIG_RD_DEBUG equ 0 CONFIG_REALTIME_ONLY equ 1 @@ -58,7 +61,6 @@ CONFIG_SPATIAL_RESAMPLING equ 1 CONFIG_SPEED_STATS equ 0 CONFIG_TUNE_BUTTERAUGLI equ 0 CONFIG_TUNE_VMAF equ 0 -CONFIG_USE_VMAF_RC equ 0 CONFIG_WEBM_IO equ 1 DECODE_HEIGHT_LIMIT equ 16384 DECODE_WIDTH_LIMIT equ 16384 diff --git a/third_party/libaom/source/config/ios/arm-neon/config/aom_config.h b/third_party/libaom/source/config/ios/arm-neon/config/aom_config.h index 655ca4c8dc..f79ffc6929 
100644 --- a/third_party/libaom/source/config/ios/arm-neon/config/aom_config.h +++ b/third_party/libaom/source/config/ios/arm-neon/config/aom_config.h @@ -22,6 +22,7 @@ #define CONFIG_AV1_HIGHBITDEPTH 0 #define CONFIG_AV1_TEMPORAL_DENOISING 1 #define CONFIG_BIG_ENDIAN 0 +#define CONFIG_BITRATE_ACCURACY 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -33,6 +34,7 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +#define CONFIG_FRAME_PARALLEL_ENCODE 0 #define CONFIG_GCC 1 #define CONFIG_GCOV 0 #define CONFIG_GPROF 0 @@ -48,6 +50,7 @@ #define CONFIG_NORMAL_TILE_MODE 1 #define CONFIG_OPTICAL_FLOW_API 0 #define CONFIG_OS_SUPPORT 1 +#define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 0 #define CONFIG_RD_DEBUG 0 #define CONFIG_REALTIME_ONLY 1 @@ -60,7 +63,6 @@ #define CONFIG_SPEED_STATS 0 #define CONFIG_TUNE_BUTTERAUGLI 0 #define CONFIG_TUNE_VMAF 0 -#define CONFIG_USE_VMAF_RC 0 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 #define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/ios/arm-neon/config/aom_dsp_rtcd.h b/third_party/libaom/source/config/ios/arm-neon/config/aom_dsp_rtcd.h index 027c19a09f..e71ec66a00 100644 --- a/third_party/libaom/source/config/ios/arm-neon/config/aom_dsp_rtcd.h +++ b/third_party/libaom/source/config/ios/arm-neon/config/aom_dsp_rtcd.h @@ -3433,6 +3433,17 @@ int64_t aom_sse_neon(const uint8_t* a, int height); #define aom_sse aom_sse_neon +void aom_ssim_parms_8x8_c(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c + uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t* src_ptr, int source_stride, int xoffset, diff --git a/third_party/libaom/source/config/ios/arm64/config/aom_config.asm 
b/third_party/libaom/source/config/ios/arm64/config/aom_config.asm index dcceb2497b..15c20d956a 100644 --- a/third_party/libaom/source/config/ios/arm64/config/aom_config.asm +++ b/third_party/libaom/source/config/ios/arm64/config/aom_config.asm @@ -20,6 +20,7 @@ CONFIG_AV1_ENCODER equ 1 CONFIG_AV1_HIGHBITDEPTH equ 0 CONFIG_AV1_TEMPORAL_DENOISING equ 1 CONFIG_BIG_ENDIAN equ 0 +CONFIG_BITRATE_ACCURACY equ 0 CONFIG_BITSTREAM_DEBUG equ 0 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 CONFIG_COLLECT_COMPONENT_TIMING equ 0 @@ -31,6 +32,7 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1 CONFIG_DIST_8X8 equ 0 CONFIG_ENTROPY_STATS equ 0 CONFIG_EXCLUDE_SIMD_MISMATCH equ 0 +CONFIG_FRAME_PARALLEL_ENCODE equ 0 CONFIG_GCC equ 1 CONFIG_GCOV equ 0 CONFIG_GPROF equ 0 @@ -46,6 +48,7 @@ CONFIG_NN_V2 equ 0 CONFIG_NORMAL_TILE_MODE equ 1 CONFIG_OPTICAL_FLOW_API equ 0 CONFIG_OS_SUPPORT equ 1 +CONFIG_PARTITION_SEARCH_ORDER equ 0 CONFIG_PIC equ 0 CONFIG_RD_DEBUG equ 0 CONFIG_REALTIME_ONLY equ 1 @@ -58,7 +61,6 @@ CONFIG_SPATIAL_RESAMPLING equ 1 CONFIG_SPEED_STATS equ 0 CONFIG_TUNE_BUTTERAUGLI equ 0 CONFIG_TUNE_VMAF equ 0 -CONFIG_USE_VMAF_RC equ 0 CONFIG_WEBM_IO equ 1 DECODE_HEIGHT_LIMIT equ 16384 DECODE_WIDTH_LIMIT equ 16384 diff --git a/third_party/libaom/source/config/ios/arm64/config/aom_config.h b/third_party/libaom/source/config/ios/arm64/config/aom_config.h index 655ca4c8dc..f79ffc6929 100644 --- a/third_party/libaom/source/config/ios/arm64/config/aom_config.h +++ b/third_party/libaom/source/config/ios/arm64/config/aom_config.h @@ -22,6 +22,7 @@ #define CONFIG_AV1_HIGHBITDEPTH 0 #define CONFIG_AV1_TEMPORAL_DENOISING 1 #define CONFIG_BIG_ENDIAN 0 +#define CONFIG_BITRATE_ACCURACY 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -33,6 +34,7 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +#define CONFIG_FRAME_PARALLEL_ENCODE 0 #define CONFIG_GCC 1 #define CONFIG_GCOV 0 
#define CONFIG_GPROF 0 @@ -48,6 +50,7 @@ #define CONFIG_NORMAL_TILE_MODE 1 #define CONFIG_OPTICAL_FLOW_API 0 #define CONFIG_OS_SUPPORT 1 +#define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 0 #define CONFIG_RD_DEBUG 0 #define CONFIG_REALTIME_ONLY 1 @@ -60,7 +63,6 @@ #define CONFIG_SPEED_STATS 0 #define CONFIG_TUNE_BUTTERAUGLI 0 #define CONFIG_TUNE_VMAF 0 -#define CONFIG_USE_VMAF_RC 0 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 #define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/ios/arm64/config/aom_dsp_rtcd.h b/third_party/libaom/source/config/ios/arm64/config/aom_dsp_rtcd.h index 027c19a09f..e71ec66a00 100644 --- a/third_party/libaom/source/config/ios/arm64/config/aom_dsp_rtcd.h +++ b/third_party/libaom/source/config/ios/arm64/config/aom_dsp_rtcd.h @@ -3433,6 +3433,17 @@ int64_t aom_sse_neon(const uint8_t* a, int height); #define aom_sse aom_sse_neon +void aom_ssim_parms_8x8_c(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c + uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t* src_ptr, int source_stride, int xoffset, diff --git a/third_party/libaom/source/config/linux/arm-neon-cpu-detect/config/aom_config.asm b/third_party/libaom/source/config/linux/arm-neon-cpu-detect/config/aom_config.asm index fbaae3b28d..ac5f20f3b1 100644 --- a/third_party/libaom/source/config/linux/arm-neon-cpu-detect/config/aom_config.asm +++ b/third_party/libaom/source/config/linux/arm-neon-cpu-detect/config/aom_config.asm @@ -20,6 +20,7 @@ CONFIG_AV1_ENCODER equ 1 CONFIG_AV1_HIGHBITDEPTH equ 0 CONFIG_AV1_TEMPORAL_DENOISING equ 1 CONFIG_BIG_ENDIAN equ 0 +CONFIG_BITRATE_ACCURACY equ 0 CONFIG_BITSTREAM_DEBUG equ 0 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 CONFIG_COLLECT_COMPONENT_TIMING equ 0 @@ -31,6 +32,7 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1 CONFIG_DIST_8X8 equ 0 
CONFIG_ENTROPY_STATS equ 0 CONFIG_EXCLUDE_SIMD_MISMATCH equ 0 +CONFIG_FRAME_PARALLEL_ENCODE equ 0 CONFIG_GCC equ 1 CONFIG_GCOV equ 0 CONFIG_GPROF equ 0 @@ -46,6 +48,7 @@ CONFIG_NN_V2 equ 0 CONFIG_NORMAL_TILE_MODE equ 1 CONFIG_OPTICAL_FLOW_API equ 0 CONFIG_OS_SUPPORT equ 1 +CONFIG_PARTITION_SEARCH_ORDER equ 0 CONFIG_PIC equ 0 CONFIG_RD_DEBUG equ 0 CONFIG_REALTIME_ONLY equ 1 @@ -58,7 +61,6 @@ CONFIG_SPATIAL_RESAMPLING equ 1 CONFIG_SPEED_STATS equ 0 CONFIG_TUNE_BUTTERAUGLI equ 0 CONFIG_TUNE_VMAF equ 0 -CONFIG_USE_VMAF_RC equ 0 CONFIG_WEBM_IO equ 1 DECODE_HEIGHT_LIMIT equ 16384 DECODE_WIDTH_LIMIT equ 16384 diff --git a/third_party/libaom/source/config/linux/arm-neon-cpu-detect/config/aom_config.h b/third_party/libaom/source/config/linux/arm-neon-cpu-detect/config/aom_config.h index adb548aa40..c8e44f4edd 100644 --- a/third_party/libaom/source/config/linux/arm-neon-cpu-detect/config/aom_config.h +++ b/third_party/libaom/source/config/linux/arm-neon-cpu-detect/config/aom_config.h @@ -22,6 +22,7 @@ #define CONFIG_AV1_HIGHBITDEPTH 0 #define CONFIG_AV1_TEMPORAL_DENOISING 1 #define CONFIG_BIG_ENDIAN 0 +#define CONFIG_BITRATE_ACCURACY 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -33,6 +34,7 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +#define CONFIG_FRAME_PARALLEL_ENCODE 0 #define CONFIG_GCC 1 #define CONFIG_GCOV 0 #define CONFIG_GPROF 0 @@ -48,6 +50,7 @@ #define CONFIG_NORMAL_TILE_MODE 1 #define CONFIG_OPTICAL_FLOW_API 0 #define CONFIG_OS_SUPPORT 1 +#define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 0 #define CONFIG_RD_DEBUG 0 #define CONFIG_REALTIME_ONLY 1 @@ -60,7 +63,6 @@ #define CONFIG_SPEED_STATS 0 #define CONFIG_TUNE_BUTTERAUGLI 0 #define CONFIG_TUNE_VMAF 0 -#define CONFIG_USE_VMAF_RC 0 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 #define DECODE_WIDTH_LIMIT 16384 diff --git 
a/third_party/libaom/source/config/linux/arm-neon-cpu-detect/config/aom_dsp_rtcd.h b/third_party/libaom/source/config/linux/arm-neon-cpu-detect/config/aom_dsp_rtcd.h index 61141406d2..a4df74d40d 100644 --- a/third_party/libaom/source/config/linux/arm-neon-cpu-detect/config/aom_dsp_rtcd.h +++ b/third_party/libaom/source/config/linux/arm-neon-cpu-detect/config/aom_dsp_rtcd.h @@ -3813,6 +3813,17 @@ RTCD_EXTERN int64_t (*aom_sse)(const uint8_t* a, int width, int height); +void aom_ssim_parms_8x8_c(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c + uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t* src_ptr, int source_stride, int xoffset, diff --git a/third_party/libaom/source/config/linux/arm-neon/config/aom_config.asm b/third_party/libaom/source/config/linux/arm-neon/config/aom_config.asm index dcceb2497b..15c20d956a 100644 --- a/third_party/libaom/source/config/linux/arm-neon/config/aom_config.asm +++ b/third_party/libaom/source/config/linux/arm-neon/config/aom_config.asm @@ -20,6 +20,7 @@ CONFIG_AV1_ENCODER equ 1 CONFIG_AV1_HIGHBITDEPTH equ 0 CONFIG_AV1_TEMPORAL_DENOISING equ 1 CONFIG_BIG_ENDIAN equ 0 +CONFIG_BITRATE_ACCURACY equ 0 CONFIG_BITSTREAM_DEBUG equ 0 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 CONFIG_COLLECT_COMPONENT_TIMING equ 0 @@ -31,6 +32,7 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1 CONFIG_DIST_8X8 equ 0 CONFIG_ENTROPY_STATS equ 0 CONFIG_EXCLUDE_SIMD_MISMATCH equ 0 +CONFIG_FRAME_PARALLEL_ENCODE equ 0 CONFIG_GCC equ 1 CONFIG_GCOV equ 0 CONFIG_GPROF equ 0 @@ -46,6 +48,7 @@ CONFIG_NN_V2 equ 0 CONFIG_NORMAL_TILE_MODE equ 1 CONFIG_OPTICAL_FLOW_API equ 0 CONFIG_OS_SUPPORT equ 1 +CONFIG_PARTITION_SEARCH_ORDER equ 0 CONFIG_PIC equ 0 CONFIG_RD_DEBUG equ 0 CONFIG_REALTIME_ONLY equ 1 @@ -58,7 +61,6 @@ CONFIG_SPATIAL_RESAMPLING equ 1 CONFIG_SPEED_STATS equ 0 CONFIG_TUNE_BUTTERAUGLI equ 0 
CONFIG_TUNE_VMAF equ 0 -CONFIG_USE_VMAF_RC equ 0 CONFIG_WEBM_IO equ 1 DECODE_HEIGHT_LIMIT equ 16384 DECODE_WIDTH_LIMIT equ 16384 diff --git a/third_party/libaom/source/config/linux/arm-neon/config/aom_config.h b/third_party/libaom/source/config/linux/arm-neon/config/aom_config.h index 655ca4c8dc..f79ffc6929 100644 --- a/third_party/libaom/source/config/linux/arm-neon/config/aom_config.h +++ b/third_party/libaom/source/config/linux/arm-neon/config/aom_config.h @@ -22,6 +22,7 @@ #define CONFIG_AV1_HIGHBITDEPTH 0 #define CONFIG_AV1_TEMPORAL_DENOISING 1 #define CONFIG_BIG_ENDIAN 0 +#define CONFIG_BITRATE_ACCURACY 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -33,6 +34,7 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +#define CONFIG_FRAME_PARALLEL_ENCODE 0 #define CONFIG_GCC 1 #define CONFIG_GCOV 0 #define CONFIG_GPROF 0 @@ -48,6 +50,7 @@ #define CONFIG_NORMAL_TILE_MODE 1 #define CONFIG_OPTICAL_FLOW_API 0 #define CONFIG_OS_SUPPORT 1 +#define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 0 #define CONFIG_RD_DEBUG 0 #define CONFIG_REALTIME_ONLY 1 @@ -60,7 +63,6 @@ #define CONFIG_SPEED_STATS 0 #define CONFIG_TUNE_BUTTERAUGLI 0 #define CONFIG_TUNE_VMAF 0 -#define CONFIG_USE_VMAF_RC 0 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 #define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/linux/arm-neon/config/aom_dsp_rtcd.h b/third_party/libaom/source/config/linux/arm-neon/config/aom_dsp_rtcd.h index 027c19a09f..e71ec66a00 100644 --- a/third_party/libaom/source/config/linux/arm-neon/config/aom_dsp_rtcd.h +++ b/third_party/libaom/source/config/linux/arm-neon/config/aom_dsp_rtcd.h @@ -3433,6 +3433,17 @@ int64_t aom_sse_neon(const uint8_t* a, int height); #define aom_sse aom_sse_neon +void aom_ssim_parms_8x8_c(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + 
uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c + uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t* src_ptr, int source_stride, int xoffset, diff --git a/third_party/libaom/source/config/linux/arm/config/aom_config.asm b/third_party/libaom/source/config/linux/arm/config/aom_config.asm index e9000243ad..bc1a95f003 100644 --- a/third_party/libaom/source/config/linux/arm/config/aom_config.asm +++ b/third_party/libaom/source/config/linux/arm/config/aom_config.asm @@ -20,6 +20,7 @@ CONFIG_AV1_ENCODER equ 1 CONFIG_AV1_HIGHBITDEPTH equ 0 CONFIG_AV1_TEMPORAL_DENOISING equ 1 CONFIG_BIG_ENDIAN equ 0 +CONFIG_BITRATE_ACCURACY equ 0 CONFIG_BITSTREAM_DEBUG equ 0 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 CONFIG_COLLECT_COMPONENT_TIMING equ 0 @@ -31,6 +32,7 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1 CONFIG_DIST_8X8 equ 0 CONFIG_ENTROPY_STATS equ 0 CONFIG_EXCLUDE_SIMD_MISMATCH equ 0 +CONFIG_FRAME_PARALLEL_ENCODE equ 0 CONFIG_GCC equ 1 CONFIG_GCOV equ 0 CONFIG_GPROF equ 0 @@ -46,6 +48,7 @@ CONFIG_NN_V2 equ 0 CONFIG_NORMAL_TILE_MODE equ 1 CONFIG_OPTICAL_FLOW_API equ 0 CONFIG_OS_SUPPORT equ 1 +CONFIG_PARTITION_SEARCH_ORDER equ 0 CONFIG_PIC equ 0 CONFIG_RD_DEBUG equ 0 CONFIG_REALTIME_ONLY equ 1 @@ -58,7 +61,6 @@ CONFIG_SPATIAL_RESAMPLING equ 1 CONFIG_SPEED_STATS equ 0 CONFIG_TUNE_BUTTERAUGLI equ 0 CONFIG_TUNE_VMAF equ 0 -CONFIG_USE_VMAF_RC equ 0 CONFIG_WEBM_IO equ 1 DECODE_HEIGHT_LIMIT equ 16384 DECODE_WIDTH_LIMIT equ 16384 diff --git a/third_party/libaom/source/config/linux/arm/config/aom_config.h b/third_party/libaom/source/config/linux/arm/config/aom_config.h index 0404a4c827..f3ac36f68c 100644 --- a/third_party/libaom/source/config/linux/arm/config/aom_config.h +++ b/third_party/libaom/source/config/linux/arm/config/aom_config.h @@ -22,6 +22,7 @@ #define CONFIG_AV1_HIGHBITDEPTH 0 #define CONFIG_AV1_TEMPORAL_DENOISING 1 #define CONFIG_BIG_ENDIAN 0 +#define CONFIG_BITRATE_ACCURACY 0 #define 
CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -33,6 +34,7 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +#define CONFIG_FRAME_PARALLEL_ENCODE 0 #define CONFIG_GCC 1 #define CONFIG_GCOV 0 #define CONFIG_GPROF 0 @@ -48,6 +50,7 @@ #define CONFIG_NORMAL_TILE_MODE 1 #define CONFIG_OPTICAL_FLOW_API 0 #define CONFIG_OS_SUPPORT 1 +#define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 0 #define CONFIG_RD_DEBUG 0 #define CONFIG_REALTIME_ONLY 1 @@ -60,7 +63,6 @@ #define CONFIG_SPEED_STATS 0 #define CONFIG_TUNE_BUTTERAUGLI 0 #define CONFIG_TUNE_VMAF 0 -#define CONFIG_USE_VMAF_RC 0 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 #define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/linux/arm/config/aom_dsp_rtcd.h b/third_party/libaom/source/config/linux/arm/config/aom_dsp_rtcd.h index d7b1b04f00..8710d625ed 100644 --- a/third_party/libaom/source/config/linux/arm/config/aom_dsp_rtcd.h +++ b/third_party/libaom/source/config/linux/arm/config/aom_dsp_rtcd.h @@ -2953,6 +2953,17 @@ int64_t aom_sse_c(const uint8_t* a, int height); #define aom_sse aom_sse_c +void aom_ssim_parms_8x8_c(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c + uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t* src_ptr, int source_stride, int xoffset, diff --git a/third_party/libaom/source/config/linux/arm64/config/aom_config.asm b/third_party/libaom/source/config/linux/arm64/config/aom_config.asm index dcceb2497b..15c20d956a 100644 --- a/third_party/libaom/source/config/linux/arm64/config/aom_config.asm +++ b/third_party/libaom/source/config/linux/arm64/config/aom_config.asm @@ -20,6 +20,7 @@ CONFIG_AV1_ENCODER equ 1 CONFIG_AV1_HIGHBITDEPTH equ 0 CONFIG_AV1_TEMPORAL_DENOISING equ 1 
CONFIG_BIG_ENDIAN equ 0 +CONFIG_BITRATE_ACCURACY equ 0 CONFIG_BITSTREAM_DEBUG equ 0 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 CONFIG_COLLECT_COMPONENT_TIMING equ 0 @@ -31,6 +32,7 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1 CONFIG_DIST_8X8 equ 0 CONFIG_ENTROPY_STATS equ 0 CONFIG_EXCLUDE_SIMD_MISMATCH equ 0 +CONFIG_FRAME_PARALLEL_ENCODE equ 0 CONFIG_GCC equ 1 CONFIG_GCOV equ 0 CONFIG_GPROF equ 0 @@ -46,6 +48,7 @@ CONFIG_NN_V2 equ 0 CONFIG_NORMAL_TILE_MODE equ 1 CONFIG_OPTICAL_FLOW_API equ 0 CONFIG_OS_SUPPORT equ 1 +CONFIG_PARTITION_SEARCH_ORDER equ 0 CONFIG_PIC equ 0 CONFIG_RD_DEBUG equ 0 CONFIG_REALTIME_ONLY equ 1 @@ -58,7 +61,6 @@ CONFIG_SPATIAL_RESAMPLING equ 1 CONFIG_SPEED_STATS equ 0 CONFIG_TUNE_BUTTERAUGLI equ 0 CONFIG_TUNE_VMAF equ 0 -CONFIG_USE_VMAF_RC equ 0 CONFIG_WEBM_IO equ 1 DECODE_HEIGHT_LIMIT equ 16384 DECODE_WIDTH_LIMIT equ 16384 diff --git a/third_party/libaom/source/config/linux/arm64/config/aom_config.h b/third_party/libaom/source/config/linux/arm64/config/aom_config.h index 655ca4c8dc..f79ffc6929 100644 --- a/third_party/libaom/source/config/linux/arm64/config/aom_config.h +++ b/third_party/libaom/source/config/linux/arm64/config/aom_config.h @@ -22,6 +22,7 @@ #define CONFIG_AV1_HIGHBITDEPTH 0 #define CONFIG_AV1_TEMPORAL_DENOISING 1 #define CONFIG_BIG_ENDIAN 0 +#define CONFIG_BITRATE_ACCURACY 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -33,6 +34,7 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +#define CONFIG_FRAME_PARALLEL_ENCODE 0 #define CONFIG_GCC 1 #define CONFIG_GCOV 0 #define CONFIG_GPROF 0 @@ -48,6 +50,7 @@ #define CONFIG_NORMAL_TILE_MODE 1 #define CONFIG_OPTICAL_FLOW_API 0 #define CONFIG_OS_SUPPORT 1 +#define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 0 #define CONFIG_RD_DEBUG 0 #define CONFIG_REALTIME_ONLY 1 @@ -60,7 +63,6 @@ #define CONFIG_SPEED_STATS 0 #define CONFIG_TUNE_BUTTERAUGLI 0 #define 
CONFIG_TUNE_VMAF 0 -#define CONFIG_USE_VMAF_RC 0 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 #define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/linux/arm64/config/aom_dsp_rtcd.h b/third_party/libaom/source/config/linux/arm64/config/aom_dsp_rtcd.h index 027c19a09f..e71ec66a00 100644 --- a/third_party/libaom/source/config/linux/arm64/config/aom_dsp_rtcd.h +++ b/third_party/libaom/source/config/linux/arm64/config/aom_dsp_rtcd.h @@ -3433,6 +3433,17 @@ int64_t aom_sse_neon(const uint8_t* a, int height); #define aom_sse aom_sse_neon +void aom_ssim_parms_8x8_c(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c + uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t* src_ptr, int source_stride, int xoffset, diff --git a/third_party/libaom/source/config/linux/generic/config/aom_config.asm b/third_party/libaom/source/config/linux/generic/config/aom_config.asm index 0e681a032e..24b965dfb3 100644 --- a/third_party/libaom/source/config/linux/generic/config/aom_config.asm +++ b/third_party/libaom/source/config/linux/generic/config/aom_config.asm @@ -20,6 +20,7 @@ CONFIG_AV1_ENCODER equ 1 CONFIG_AV1_HIGHBITDEPTH equ 0 CONFIG_AV1_TEMPORAL_DENOISING equ 1 CONFIG_BIG_ENDIAN equ 0 +CONFIG_BITRATE_ACCURACY equ 0 CONFIG_BITSTREAM_DEBUG equ 0 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 CONFIG_COLLECT_COMPONENT_TIMING equ 0 @@ -31,6 +32,7 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1 CONFIG_DIST_8X8 equ 0 CONFIG_ENTROPY_STATS equ 0 CONFIG_EXCLUDE_SIMD_MISMATCH equ 0 +CONFIG_FRAME_PARALLEL_ENCODE equ 0 CONFIG_GCC equ 1 CONFIG_GCOV equ 0 CONFIG_GPROF equ 0 @@ -46,6 +48,7 @@ CONFIG_NN_V2 equ 0 CONFIG_NORMAL_TILE_MODE equ 1 CONFIG_OPTICAL_FLOW_API equ 0 CONFIG_OS_SUPPORT equ 1 +CONFIG_PARTITION_SEARCH_ORDER equ 0 CONFIG_PIC equ 0 CONFIG_RD_DEBUG equ 0 CONFIG_REALTIME_ONLY equ 1 @@ -58,7 +61,6 
@@ CONFIG_SPATIAL_RESAMPLING equ 1 CONFIG_SPEED_STATS equ 0 CONFIG_TUNE_BUTTERAUGLI equ 0 CONFIG_TUNE_VMAF equ 0 -CONFIG_USE_VMAF_RC equ 0 CONFIG_WEBM_IO equ 1 DECODE_HEIGHT_LIMIT equ 16384 DECODE_WIDTH_LIMIT equ 16384 diff --git a/third_party/libaom/source/config/linux/generic/config/aom_config.h b/third_party/libaom/source/config/linux/generic/config/aom_config.h index 0e1665a47e..cdb4794210 100644 --- a/third_party/libaom/source/config/linux/generic/config/aom_config.h +++ b/third_party/libaom/source/config/linux/generic/config/aom_config.h @@ -22,6 +22,7 @@ #define CONFIG_AV1_HIGHBITDEPTH 0 #define CONFIG_AV1_TEMPORAL_DENOISING 1 #define CONFIG_BIG_ENDIAN 0 +#define CONFIG_BITRATE_ACCURACY 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -33,6 +34,7 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +#define CONFIG_FRAME_PARALLEL_ENCODE 0 #define CONFIG_GCC 1 #define CONFIG_GCOV 0 #define CONFIG_GPROF 0 @@ -48,6 +50,7 @@ #define CONFIG_NORMAL_TILE_MODE 1 #define CONFIG_OPTICAL_FLOW_API 0 #define CONFIG_OS_SUPPORT 1 +#define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 0 #define CONFIG_RD_DEBUG 0 #define CONFIG_REALTIME_ONLY 1 @@ -60,7 +63,6 @@ #define CONFIG_SPEED_STATS 0 #define CONFIG_TUNE_BUTTERAUGLI 0 #define CONFIG_TUNE_VMAF 0 -#define CONFIG_USE_VMAF_RC 0 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 #define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/linux/generic/config/aom_dsp_rtcd.h b/third_party/libaom/source/config/linux/generic/config/aom_dsp_rtcd.h index 05bfa838bb..702c1b809e 100644 --- a/third_party/libaom/source/config/linux/generic/config/aom_dsp_rtcd.h +++ b/third_party/libaom/source/config/linux/generic/config/aom_dsp_rtcd.h @@ -2953,6 +2953,17 @@ int64_t aom_sse_c(const uint8_t* a, int height); #define aom_sse aom_sse_c +void aom_ssim_parms_8x8_c(const uint8_t* s, + int sp, 
+ const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c + uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t* src_ptr, int source_stride, int xoffset, diff --git a/third_party/libaom/source/config/linux/ia32/config/aom_config.asm b/third_party/libaom/source/config/linux/ia32/config/aom_config.asm index d8ec860317..f4e2dfb836 100644 --- a/third_party/libaom/source/config/linux/ia32/config/aom_config.asm +++ b/third_party/libaom/source/config/linux/ia32/config/aom_config.asm @@ -10,6 +10,7 @@ %define CONFIG_AV1_HIGHBITDEPTH 0 %define CONFIG_AV1_TEMPORAL_DENOISING 1 %define CONFIG_BIG_ENDIAN 0 +%define CONFIG_BITRATE_ACCURACY 0 %define CONFIG_BITSTREAM_DEBUG 0 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 %define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -21,6 +22,7 @@ %define CONFIG_DIST_8X8 0 %define CONFIG_ENTROPY_STATS 0 %define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +%define CONFIG_FRAME_PARALLEL_ENCODE 0 %define CONFIG_GCC 1 %define CONFIG_GCOV 0 %define CONFIG_GPROF 0 @@ -36,6 +38,7 @@ %define CONFIG_NORMAL_TILE_MODE 1 %define CONFIG_OPTICAL_FLOW_API 0 %define CONFIG_OS_SUPPORT 1 +%define CONFIG_PARTITION_SEARCH_ORDER 0 %define CONFIG_PIC 1 %define CONFIG_RD_DEBUG 0 %define CONFIG_REALTIME_ONLY 1 @@ -48,7 +51,6 @@ %define CONFIG_SPEED_STATS 0 %define CONFIG_TUNE_BUTTERAUGLI 0 %define CONFIG_TUNE_VMAF 0 -%define CONFIG_USE_VMAF_RC 0 %define CONFIG_WEBM_IO 1 %define DECODE_HEIGHT_LIMIT 16384 %define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/linux/ia32/config/aom_config.h b/third_party/libaom/source/config/linux/ia32/config/aom_config.h index 53666caafa..1b3bba6797 100644 --- a/third_party/libaom/source/config/linux/ia32/config/aom_config.h +++ b/third_party/libaom/source/config/linux/ia32/config/aom_config.h @@ -22,6 +22,7 @@ #define CONFIG_AV1_HIGHBITDEPTH 0 #define CONFIG_AV1_TEMPORAL_DENOISING 1 #define 
CONFIG_BIG_ENDIAN 0 +#define CONFIG_BITRATE_ACCURACY 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -33,6 +34,7 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +#define CONFIG_FRAME_PARALLEL_ENCODE 0 #define CONFIG_GCC 1 #define CONFIG_GCOV 0 #define CONFIG_GPROF 0 @@ -48,6 +50,7 @@ #define CONFIG_NORMAL_TILE_MODE 1 #define CONFIG_OPTICAL_FLOW_API 0 #define CONFIG_OS_SUPPORT 1 +#define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 1 #define CONFIG_RD_DEBUG 0 #define CONFIG_REALTIME_ONLY 1 @@ -60,7 +63,6 @@ #define CONFIG_SPEED_STATS 0 #define CONFIG_TUNE_BUTTERAUGLI 0 #define CONFIG_TUNE_VMAF 0 -#define CONFIG_USE_VMAF_RC 0 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 #define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/linux/ia32/config/aom_dsp_rtcd.h b/third_party/libaom/source/config/linux/ia32/config/aom_dsp_rtcd.h index 64bc1f4056..323c55e888 100644 --- a/third_party/libaom/source/config/linux/ia32/config/aom_dsp_rtcd.h +++ b/third_party/libaom/source/config/linux/ia32/config/aom_dsp_rtcd.h @@ -6787,6 +6787,17 @@ RTCD_EXTERN int64_t (*aom_sse)(const uint8_t* a, int width, int height); +void aom_ssim_parms_8x8_c(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c + uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t* src_ptr, int source_stride, int xoffset, diff --git a/third_party/libaom/source/config/linux/x64/config/aom_config.asm b/third_party/libaom/source/config/linux/x64/config/aom_config.asm index 0fdb4ea1e8..b15994bbd7 100644 --- a/third_party/libaom/source/config/linux/x64/config/aom_config.asm +++ b/third_party/libaom/source/config/linux/x64/config/aom_config.asm @@ -10,6 +10,7 @@ %define CONFIG_AV1_HIGHBITDEPTH 0 
%define CONFIG_AV1_TEMPORAL_DENOISING 1 %define CONFIG_BIG_ENDIAN 0 +%define CONFIG_BITRATE_ACCURACY 0 %define CONFIG_BITSTREAM_DEBUG 0 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 %define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -21,6 +22,7 @@ %define CONFIG_DIST_8X8 0 %define CONFIG_ENTROPY_STATS 0 %define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +%define CONFIG_FRAME_PARALLEL_ENCODE 0 %define CONFIG_GCC 1 %define CONFIG_GCOV 0 %define CONFIG_GPROF 0 @@ -36,6 +38,7 @@ %define CONFIG_NORMAL_TILE_MODE 1 %define CONFIG_OPTICAL_FLOW_API 0 %define CONFIG_OS_SUPPORT 1 +%define CONFIG_PARTITION_SEARCH_ORDER 0 %define CONFIG_PIC 0 %define CONFIG_RD_DEBUG 0 %define CONFIG_REALTIME_ONLY 1 @@ -48,7 +51,6 @@ %define CONFIG_SPEED_STATS 0 %define CONFIG_TUNE_BUTTERAUGLI 0 %define CONFIG_TUNE_VMAF 0 -%define CONFIG_USE_VMAF_RC 0 %define CONFIG_WEBM_IO 1 %define DECODE_HEIGHT_LIMIT 16384 %define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/linux/x64/config/aom_config.h b/third_party/libaom/source/config/linux/x64/config/aom_config.h index d026bc215f..d090f8398a 100644 --- a/third_party/libaom/source/config/linux/x64/config/aom_config.h +++ b/third_party/libaom/source/config/linux/x64/config/aom_config.h @@ -22,6 +22,7 @@ #define CONFIG_AV1_HIGHBITDEPTH 0 #define CONFIG_AV1_TEMPORAL_DENOISING 1 #define CONFIG_BIG_ENDIAN 0 +#define CONFIG_BITRATE_ACCURACY 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -33,6 +34,7 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +#define CONFIG_FRAME_PARALLEL_ENCODE 0 #define CONFIG_GCC 1 #define CONFIG_GCOV 0 #define CONFIG_GPROF 0 @@ -48,6 +50,7 @@ #define CONFIG_NORMAL_TILE_MODE 1 #define CONFIG_OPTICAL_FLOW_API 0 #define CONFIG_OS_SUPPORT 1 +#define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 0 #define CONFIG_RD_DEBUG 0 #define CONFIG_REALTIME_ONLY 1 @@ -60,7 +63,6 @@ #define CONFIG_SPEED_STATS 0 
#define CONFIG_TUNE_BUTTERAUGLI 0 #define CONFIG_TUNE_VMAF 0 -#define CONFIG_USE_VMAF_RC 0 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 #define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/linux/x64/config/aom_dsp_rtcd.h b/third_party/libaom/source/config/linux/x64/config/aom_dsp_rtcd.h index 58de231219..dd561e4498 100644 --- a/third_party/libaom/source/config/linux/x64/config/aom_dsp_rtcd.h +++ b/third_party/libaom/source/config/linux/x64/config/aom_dsp_rtcd.h @@ -6814,6 +6814,26 @@ RTCD_EXTERN int64_t (*aom_sse)(const uint8_t* a, int width, int height); +void aom_ssim_parms_8x8_c(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +void aom_ssim_parms_8x8_sse2(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_sse2 + uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t* src_ptr, int source_stride, int xoffset, diff --git a/third_party/libaom/source/config/win/arm64/config/aom_config.asm b/third_party/libaom/source/config/win/arm64/config/aom_config.asm index dcceb2497b..15c20d956a 100644 --- a/third_party/libaom/source/config/win/arm64/config/aom_config.asm +++ b/third_party/libaom/source/config/win/arm64/config/aom_config.asm @@ -20,6 +20,7 @@ CONFIG_AV1_ENCODER equ 1 CONFIG_AV1_HIGHBITDEPTH equ 0 CONFIG_AV1_TEMPORAL_DENOISING equ 1 CONFIG_BIG_ENDIAN equ 0 +CONFIG_BITRATE_ACCURACY equ 0 CONFIG_BITSTREAM_DEBUG equ 0 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 CONFIG_COLLECT_COMPONENT_TIMING equ 0 @@ -31,6 +32,7 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1 CONFIG_DIST_8X8 equ 0 CONFIG_ENTROPY_STATS equ 0 CONFIG_EXCLUDE_SIMD_MISMATCH equ 0 +CONFIG_FRAME_PARALLEL_ENCODE equ 0 CONFIG_GCC equ 1 CONFIG_GCOV equ 0 CONFIG_GPROF equ 0 @@ -46,6 +48,7 @@ CONFIG_NN_V2 
equ 0 CONFIG_NORMAL_TILE_MODE equ 1 CONFIG_OPTICAL_FLOW_API equ 0 CONFIG_OS_SUPPORT equ 1 +CONFIG_PARTITION_SEARCH_ORDER equ 0 CONFIG_PIC equ 0 CONFIG_RD_DEBUG equ 0 CONFIG_REALTIME_ONLY equ 1 @@ -58,7 +61,6 @@ CONFIG_SPATIAL_RESAMPLING equ 1 CONFIG_SPEED_STATS equ 0 CONFIG_TUNE_BUTTERAUGLI equ 0 CONFIG_TUNE_VMAF equ 0 -CONFIG_USE_VMAF_RC equ 0 CONFIG_WEBM_IO equ 1 DECODE_HEIGHT_LIMIT equ 16384 DECODE_WIDTH_LIMIT equ 16384 diff --git a/third_party/libaom/source/config/win/arm64/config/aom_config.h b/third_party/libaom/source/config/win/arm64/config/aom_config.h index 5ca170928b..c744a45ff4 100644 --- a/third_party/libaom/source/config/win/arm64/config/aom_config.h +++ b/third_party/libaom/source/config/win/arm64/config/aom_config.h @@ -22,6 +22,7 @@ #define CONFIG_AV1_HIGHBITDEPTH 0 #define CONFIG_AV1_TEMPORAL_DENOISING 1 #define CONFIG_BIG_ENDIAN 0 +#define CONFIG_BITRATE_ACCURACY 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -33,6 +34,7 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +#define CONFIG_FRAME_PARALLEL_ENCODE 0 #define CONFIG_GCC 0 #define CONFIG_GCOV 0 #define CONFIG_GPROF 0 @@ -48,6 +50,7 @@ #define CONFIG_NORMAL_TILE_MODE 1 #define CONFIG_OPTICAL_FLOW_API 0 #define CONFIG_OS_SUPPORT 1 +#define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 0 #define CONFIG_RD_DEBUG 0 #define CONFIG_REALTIME_ONLY 1 @@ -60,7 +63,6 @@ #define CONFIG_SPEED_STATS 0 #define CONFIG_TUNE_BUTTERAUGLI 0 #define CONFIG_TUNE_VMAF 0 -#define CONFIG_USE_VMAF_RC 0 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 #define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/win/arm64/config/aom_dsp_rtcd.h b/third_party/libaom/source/config/win/arm64/config/aom_dsp_rtcd.h index 027c19a09f..e71ec66a00 100644 --- a/third_party/libaom/source/config/win/arm64/config/aom_dsp_rtcd.h +++ 
b/third_party/libaom/source/config/win/arm64/config/aom_dsp_rtcd.h @@ -3433,6 +3433,17 @@ int64_t aom_sse_neon(const uint8_t* a, int height); #define aom_sse aom_sse_neon +void aom_ssim_parms_8x8_c(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c + uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t* src_ptr, int source_stride, int xoffset, diff --git a/third_party/libaom/source/config/win/ia32/config/aom_config.asm b/third_party/libaom/source/config/win/ia32/config/aom_config.asm index 789f7c98f7..ad1912f54d 100644 --- a/third_party/libaom/source/config/win/ia32/config/aom_config.asm +++ b/third_party/libaom/source/config/win/ia32/config/aom_config.asm @@ -10,6 +10,7 @@ %define CONFIG_AV1_HIGHBITDEPTH 0 %define CONFIG_AV1_TEMPORAL_DENOISING 1 %define CONFIG_BIG_ENDIAN 0 +%define CONFIG_BITRATE_ACCURACY 0 %define CONFIG_BITSTREAM_DEBUG 0 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 %define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -21,6 +22,7 @@ %define CONFIG_DIST_8X8 0 %define CONFIG_ENTROPY_STATS 0 %define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +%define CONFIG_FRAME_PARALLEL_ENCODE 0 %define CONFIG_GCC 0 %define CONFIG_GCOV 0 %define CONFIG_GPROF 0 @@ -36,6 +38,7 @@ %define CONFIG_NORMAL_TILE_MODE 1 %define CONFIG_OPTICAL_FLOW_API 0 %define CONFIG_OS_SUPPORT 1 +%define CONFIG_PARTITION_SEARCH_ORDER 0 %define CONFIG_PIC 1 %define CONFIG_RD_DEBUG 0 %define CONFIG_REALTIME_ONLY 1 @@ -48,7 +51,6 @@ %define CONFIG_SPEED_STATS 0 %define CONFIG_TUNE_BUTTERAUGLI 0 %define CONFIG_TUNE_VMAF 0 -%define CONFIG_USE_VMAF_RC 0 %define CONFIG_WEBM_IO 1 %define DECODE_HEIGHT_LIMIT 16384 %define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/win/ia32/config/aom_config.h b/third_party/libaom/source/config/win/ia32/config/aom_config.h index e9cafd4296..044ba296e6 100644 --- 
a/third_party/libaom/source/config/win/ia32/config/aom_config.h +++ b/third_party/libaom/source/config/win/ia32/config/aom_config.h @@ -22,6 +22,7 @@ #define CONFIG_AV1_HIGHBITDEPTH 0 #define CONFIG_AV1_TEMPORAL_DENOISING 1 #define CONFIG_BIG_ENDIAN 0 +#define CONFIG_BITRATE_ACCURACY 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -33,6 +34,7 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +#define CONFIG_FRAME_PARALLEL_ENCODE 0 #define CONFIG_GCC 0 #define CONFIG_GCOV 0 #define CONFIG_GPROF 0 @@ -48,6 +50,7 @@ #define CONFIG_NORMAL_TILE_MODE 1 #define CONFIG_OPTICAL_FLOW_API 0 #define CONFIG_OS_SUPPORT 1 +#define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 1 #define CONFIG_RD_DEBUG 0 #define CONFIG_REALTIME_ONLY 1 @@ -60,7 +63,6 @@ #define CONFIG_SPEED_STATS 0 #define CONFIG_TUNE_BUTTERAUGLI 0 #define CONFIG_TUNE_VMAF 0 -#define CONFIG_USE_VMAF_RC 0 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 #define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/win/ia32/config/aom_dsp_rtcd.h b/third_party/libaom/source/config/win/ia32/config/aom_dsp_rtcd.h index 64bc1f4056..323c55e888 100644 --- a/third_party/libaom/source/config/win/ia32/config/aom_dsp_rtcd.h +++ b/third_party/libaom/source/config/win/ia32/config/aom_dsp_rtcd.h @@ -6787,6 +6787,17 @@ RTCD_EXTERN int64_t (*aom_sse)(const uint8_t* a, int width, int height); +void aom_ssim_parms_8x8_c(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_c + uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t* src_ptr, int source_stride, int xoffset, diff --git a/third_party/libaom/source/config/win/x64/config/aom_config.asm b/third_party/libaom/source/config/win/x64/config/aom_config.asm index 
bdebbbe6b3..f3e1660a08 100644 --- a/third_party/libaom/source/config/win/x64/config/aom_config.asm +++ b/third_party/libaom/source/config/win/x64/config/aom_config.asm @@ -10,6 +10,7 @@ %define CONFIG_AV1_HIGHBITDEPTH 0 %define CONFIG_AV1_TEMPORAL_DENOISING 1 %define CONFIG_BIG_ENDIAN 0 +%define CONFIG_BITRATE_ACCURACY 0 %define CONFIG_BITSTREAM_DEBUG 0 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 %define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -21,6 +22,7 @@ %define CONFIG_DIST_8X8 0 %define CONFIG_ENTROPY_STATS 0 %define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +%define CONFIG_FRAME_PARALLEL_ENCODE 0 %define CONFIG_GCC 0 %define CONFIG_GCOV 0 %define CONFIG_GPROF 0 @@ -36,6 +38,7 @@ %define CONFIG_NORMAL_TILE_MODE 1 %define CONFIG_OPTICAL_FLOW_API 0 %define CONFIG_OS_SUPPORT 1 +%define CONFIG_PARTITION_SEARCH_ORDER 0 %define CONFIG_PIC 0 %define CONFIG_RD_DEBUG 0 %define CONFIG_REALTIME_ONLY 1 @@ -48,7 +51,6 @@ %define CONFIG_SPEED_STATS 0 %define CONFIG_TUNE_BUTTERAUGLI 0 %define CONFIG_TUNE_VMAF 0 -%define CONFIG_USE_VMAF_RC 0 %define CONFIG_WEBM_IO 1 %define DECODE_HEIGHT_LIMIT 16384 %define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/win/x64/config/aom_config.h b/third_party/libaom/source/config/win/x64/config/aom_config.h index 6187935081..1adc7b7407 100644 --- a/third_party/libaom/source/config/win/x64/config/aom_config.h +++ b/third_party/libaom/source/config/win/x64/config/aom_config.h @@ -22,6 +22,7 @@ #define CONFIG_AV1_HIGHBITDEPTH 0 #define CONFIG_AV1_TEMPORAL_DENOISING 1 #define CONFIG_BIG_ENDIAN 0 +#define CONFIG_BITRATE_ACCURACY 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_COLLECT_COMPONENT_TIMING 0 @@ -33,6 +34,7 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_EXCLUDE_SIMD_MISMATCH 0 +#define CONFIG_FRAME_PARALLEL_ENCODE 0 #define CONFIG_GCC 0 #define CONFIG_GCOV 0 #define CONFIG_GPROF 0 @@ -48,6 +50,7 @@ #define CONFIG_NORMAL_TILE_MODE 1 #define 
CONFIG_OPTICAL_FLOW_API 0 #define CONFIG_OS_SUPPORT 1 +#define CONFIG_PARTITION_SEARCH_ORDER 0 #define CONFIG_PIC 0 #define CONFIG_RD_DEBUG 0 #define CONFIG_REALTIME_ONLY 1 @@ -60,7 +63,6 @@ #define CONFIG_SPEED_STATS 0 #define CONFIG_TUNE_BUTTERAUGLI 0 #define CONFIG_TUNE_VMAF 0 -#define CONFIG_USE_VMAF_RC 0 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 #define DECODE_WIDTH_LIMIT 16384 diff --git a/third_party/libaom/source/config/win/x64/config/aom_dsp_rtcd.h b/third_party/libaom/source/config/win/x64/config/aom_dsp_rtcd.h index 58de231219..dd561e4498 100644 --- a/third_party/libaom/source/config/win/x64/config/aom_dsp_rtcd.h +++ b/third_party/libaom/source/config/win/x64/config/aom_dsp_rtcd.h @@ -6814,6 +6814,26 @@ RTCD_EXTERN int64_t (*aom_sse)(const uint8_t* a, int width, int height); +void aom_ssim_parms_8x8_c(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +void aom_ssim_parms_8x8_sse2(const uint8_t* s, + int sp, + const uint8_t* r, + int rp, + uint32_t* sum_s, + uint32_t* sum_r, + uint32_t* sum_sq_s, + uint32_t* sum_sq_r, + uint32_t* sum_sxr); +#define aom_ssim_parms_8x8 aom_ssim_parms_8x8_sse2 + uint32_t aom_sub_pixel_avg_variance128x128_c(const uint8_t* src_ptr, int source_stride, int xoffset, diff --git a/third_party/libaom/source/libaom/aom/aom.h b/third_party/libaom/source/libaom/aom/aom.h index c591dc9a43..0650a11f6b 100644 --- a/third_party/libaom/source/libaom/aom/aom.h +++ b/third_party/libaom/source/libaom/aom/aom.h @@ -41,27 +41,45 @@ extern "C" { /*!\brief Control functions * * The set of macros define the control functions of AOM interface + * The range for common control IDs is 230-255(max). */ enum aom_com_control_id { - /* TODO(https://crbug.com/aomedia/2671): The encoder overlaps the range of - * these values for its control ids, see the NOTEs in aom/aomcx.h. 
These - * should be migrated to something like the AOM_DECODER_CTRL_ID_START range - * next time we're ready to break the ABI. + /*!\brief Codec control function to get a pointer to a reference frame + * + * av1_ref_frame_t* parameter */ - AV1_GET_REFERENCE = 128, /**< get a pointer to a reference frame, - av1_ref_frame_t* parameter */ - AV1_SET_REFERENCE = 129, /**< write a frame into a reference buffer, - av1_ref_frame_t* parameter */ - AV1_COPY_REFERENCE = 130, /**< get a copy of reference frame from the decoderm - av1_ref_frame_t* parameter */ - AOM_COMMON_CTRL_ID_MAX, - - AV1_GET_NEW_FRAME_IMAGE = - 192, /**< get a pointer to the new frame, aom_image_t* parameter */ - AV1_COPY_NEW_FRAME_IMAGE = 193, /**< copy the new frame to an external buffer, - aom_image_t* parameter */ + AV1_GET_REFERENCE = 230, + /*!\brief Codec control function to write a frame into a reference buffer + * + * av1_ref_frame_t* parameter + */ + AV1_SET_REFERENCE = 231, + + /*!\brief Codec control function to get a copy of reference frame from the + * decoder + * + * av1_ref_frame_t* parameter + */ + AV1_COPY_REFERENCE = 232, + + /*!\brief Codec control function to get a pointer to the new frame + * + * aom_image_t* parameter + */ + AV1_GET_NEW_FRAME_IMAGE = 233, + + /*!\brief Codec control function to copy the new frame to an external buffer + * + * aom_image_t* parameter + */ + AV1_COPY_NEW_FRAME_IMAGE = 234, + + /*!\brief Start point of control IDs for aom_dec_control_id. + * Any new common control IDs should be added above. + */ AOM_DECODER_CTRL_ID_START = 256 + // No common control IDs should be added after AOM_DECODER_CTRL_ID_START. 
}; /*!\brief AV1 specific reference frame data struct diff --git a/third_party/libaom/source/libaom/aom/aom_codec.h b/third_party/libaom/source/libaom/aom/aom_codec.h index f58272ee2c..a2a9efaef3 100644 --- a/third_party/libaom/source/libaom/aom/aom_codec.h +++ b/third_party/libaom/source/libaom/aom/aom_codec.h @@ -149,7 +149,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define AOM_CODEC_ABI_VERSION (6 + AOM_IMAGE_ABI_VERSION) /**<\hideinitializer*/ +#define AOM_CODEC_ABI_VERSION (7 + AOM_IMAGE_ABI_VERSION) /**<\hideinitializer*/ /*!\brief Algorithm return codes */ typedef enum { diff --git a/third_party/libaom/source/libaom/aom/aom_encoder.h b/third_party/libaom/source/libaom/aom/aom_encoder.h index 48e705646d..a98c8d8270 100644 --- a/third_party/libaom/source/libaom/aom/aom_encoder.h +++ b/third_party/libaom/source/libaom/aom/aom_encoder.h @@ -31,6 +31,7 @@ extern "C" { #endif #include "aom/aom_codec.h" +#include "aom/aom_external_partition.h" /*!\brief Current ABI version number * @@ -41,7 +42,7 @@ extern "C" { * fields to structures */ #define AOM_ENCODER_ABI_VERSION \ - (9 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/ + (9 + AOM_CODEC_ABI_VERSION + AOM_EXT_PART_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Encoder capabilities bitfield * @@ -142,15 +143,8 @@ typedef struct aom_codec_cx_pkt { double psnr_hbd[4]; } psnr; /**< data for PSNR packet */ aom_fixed_buf_t raw; /**< data for arbitrary packets */ - - /* This packet size is fixed to allow codecs to extend this - * interface without having to manage storage for raw packets, - * i.e., if it's smaller than 128 bytes, you can store in the - * packet list directly. 
- */ - char pad[128 - sizeof(enum aom_codec_cx_pkt_kind)]; /**< fixed sz */ - } data; /**< packet data */ -} aom_codec_cx_pkt_t; /**< alias for struct aom_codec_cx_pkt */ + } data; /**< packet data */ +} aom_codec_cx_pkt_t; /**< alias for struct aom_codec_cx_pkt */ /*!\brief Rational Number * @@ -300,10 +294,6 @@ typedef struct cfg_options { * */ unsigned int disable_smooth_intra; - /*!\brief disable D45 to D203 intra modes - * - */ - unsigned int disable_diagonal_intra; /*!\brief disable filter intra * */ @@ -880,11 +870,11 @@ typedef struct aom_codec_enc_cfg { */ unsigned int use_fixed_qp_offsets; -/*!\brief Number of fixed QP offsets +/*!\brief Max number of fixed QP offsets * * This defines the number of elements in the fixed_qp_offsets array. */ -#define FIXED_QP_OFFSET_COUNT 5 +#define FIXED_QP_OFFSET_COUNT 6 /*!\brief Array of fixed QP offsets * diff --git a/third_party/libaom/source/libaom/aom/aom_external_partition.h b/third_party/libaom/source/libaom/aom/aom_external_partition.h new file mode 100644 index 0000000000..3710466316 --- /dev/null +++ b/third_party/libaom/source/libaom/aom/aom_external_partition.h @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AOM_AOM_EXTERNAL_PARTITION_H_ +#define AOM_AOM_AOM_EXTERNAL_PARTITION_H_ + +/*!\defgroup aom_encoder AOMedia AOM/AV1 Encoder + * \ingroup aom + * + * @{ + */ +#include "./aom_integer.h" + +/*!\file + * \brief Provides function pointer definitions for the external partition. 
+ */ + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures. + */ +#define AOM_EXT_PART_ABI_VERSION (1) + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\brief Abstract external partition model handler + */ +typedef void *aom_ext_part_model_t; + +/*!\brief Number of features to determine whether to skip partition none and + * do partition split directly. The same as "FEATURE_SIZE_SMS_SPLIT". + */ +#define SIZE_DIRECT_SPLIT 17 + +/*!\brief Number of features to use simple motion search to prune out + * rectangular partition in some direction. The same as + * "FEATURE_SIZE_SMS_PRUNE_PART". + */ +#define SIZE_PRUNE_PART 25 + +/*!\brief Number of features to prune split and rectangular partition + * after PARTITION_NONE. + */ +#define SIZE_PRUNE_NONE 4 + +/*!\brief Number of features to terminates partition after partition none using + * simple_motion_search features and the rate, distortion, and rdcost of + * PARTITION_NONE. The same as "FEATURE_SIZE_SMS_TERM_NONE". + */ +#define SIZE_TERM_NONE 28 + +/*!\brief Number of features to terminates partition after partition split. + */ +#define SIZE_TERM_SPLIT 31 + +/*!\brief Number of features to prune rectangular partition using stats + * collected after partition split. + */ +#define SIZE_PRUNE_RECT 9 + +/*!\brief Number of features to prune AB partition using stats + * collected after rectangular partition.. + */ +#define SIZE_PRUNE_AB 10 + +/*!\brief Number of features to prune 4-way partition using stats + * collected after AB partition. + */ +#define SIZE_PRUNE_4_WAY 18 + +/*!\brief Config information sent to the external partition model. + * + * For example, the maximum superblock size determined by the sequence header. 
+ */ +typedef struct aom_ext_part_config { + int superblock_size; /**< super block size (either 64x64 or 128x128) */ +} aom_ext_part_config_t; + +/*!\brief Features pass to the external model to make partition decisions. + * Specifically, features collected before NONE partition. + * Features "f" are used to determine: + * partition_none_allowed, partition_horz_allowed, partition_vert_allowed, + * do_rectangular_split, do_square_split + * Features "f_part2" are used to determine: + * prune_horz, prune_vert. + */ +typedef struct aom_partition_features_before_none { + float f[SIZE_DIRECT_SPLIT]; /**< features to determine whether skip partition + none and do split directly */ + float f_part2[SIZE_PRUNE_PART]; /**< features to determine whether to prune + rectangular partition */ +} aom_partition_features_before_none_t; + +/*!\brief Features pass to the external model to make partition decisions. + * Specifically, features collected after NONE partition. + */ +typedef struct aom_partition_features_none { + float f[SIZE_PRUNE_NONE]; /**< features to prune split and rectangular + partition*/ + float f_terminate[SIZE_TERM_NONE]; /**< features to determine termination of + partition */ +} aom_partition_features_none_t; + +/*!\brief Features pass to the external model to make partition decisions. + * Specifically, features collected after SPLIT partition. + */ +typedef struct aom_partition_features_split { + float f_terminate[SIZE_TERM_SPLIT]; /**< features to determine termination of + partition */ + float f_prune_rect[SIZE_PRUNE_RECT]; /**< features to determine pruning rect + partition */ +} aom_partition_features_split_t; + +/*!\brief Features pass to the external model to make partition decisions. + * Specifically, features collected after RECTANGULAR partition. 
+ */ +typedef struct aom_partition_features_rect { + float f[SIZE_PRUNE_AB]; /**< features to determine pruning AB partition */ +} aom_partition_features_rect_t; + +/*!\brief Features pass to the external model to make partition decisions. + * Specifically, features collected after AB partition: HORZ_A, HORZ_B, VERT_A, + * VERT_B. + */ +typedef struct aom_partition_features_ab { + float + f[SIZE_PRUNE_4_WAY]; /**< features to determine pruning 4-way partition */ +} aom_partition_features_ab_t; + +/*!\brief Feature id to tell the external model the current stage in partition + * pruning and what features to use to make decisions accordingly. + */ +typedef enum { + FEATURE_BEFORE_PART_NONE, + FEATURE_BEFORE_PART_NONE_PART2, + FEATURE_AFTER_PART_NONE, + FEATURE_AFTER_PART_NONE_PART2, + FEATURE_AFTER_PART_SPLIT, + FEATURE_AFTER_PART_SPLIT_PART2, + FEATURE_AFTER_PART_RECT, + FEATURE_AFTER_PART_AB +} PART_FEATURE_ID; + +/*!\brief Features pass to the external model to make partition decisions. + * + * The encoder sends these features to the external model through + * "func()" defined in ..... + * + * NOTE: new member variables may be added to this structure in the future. + * Once new features are finalized, bump the major version of libaom. + */ +typedef struct aom_partition_features { + PART_FEATURE_ID id; /**< Feature ID to indicate active features */ + aom_partition_features_before_none_t + before_part_none; /**< Features collected before NONE partition */ + aom_partition_features_none_t + after_part_none; /**< Features collected after NONE partition */ + aom_partition_features_split_t + after_part_split; /**< Features collected after SPLIT partition */ + aom_partition_features_rect_t + after_part_rect; /**< Features collected after RECTANGULAR partition */ + aom_partition_features_ab_t + after_part_ab; /**< Features collected after AB partition */ +} aom_partition_features_t; + +/*!\brief Partition decisions received from the external model. 
+ * + * The encoder receives partition decisions and encodes the superblock + * with the given partition type. + * The encoder receives it from "func()" define in .... + * + * NOTE: new member variables may be added to this structure in the future. + * Once new features are finalized, bump the major version of libaom. + */ +typedef struct aom_partition_decision { + // Decisions for directly set partition types + int is_final_decision; /**< The flag whether it is the final decision */ + int partition_decision[256]; /**< Partition decisions */ + + // Decisions for partition type pruning + int terminate_partition_search; /**< Terminate further partition search */ + int partition_none_allowed; /**< Allow partition none type */ + int partition_rect_allowed[2]; /**< Allow rectangular partitions */ + int do_rectangular_split; /**< Try rectangular split partition */ + int do_square_split; /**< Try square split partition */ + int prune_rect_part[2]; /**< Prune rectangular partition */ + int horza_partition_allowed; /**< Allow HORZ_A partitioin */ + int horzb_partition_allowed; /**< Allow HORZ_B partitioin */ + int verta_partition_allowed; /**< Allow VERT_A partitioin */ + int vertb_partition_allowed; /**< Allow VERT_B partitioin */ + int partition_horz4_allowed; /**< Allow HORZ4 partition */ + int partition_vert4_allowed; /**< Allow VERT4 partition */ +} aom_partition_decision_t; + +/*!\brief Encoding stats for the given partition decision. + * + * The encoding stats collected by encoding the superblock with the + * given partition types. + * The encoder sends the stats to the external model for training + * or inference though "func()" defined in .... + */ +typedef struct aom_partition_stats { + int rate; /**< Rate cost of the block */ + int64_t dist; /**< Distortion of the block */ + int64_t rdcost; /**< Rate-distortion cost of the block */ +} aom_partition_stats_t; + +/*!\brief Enum for return status. 
+ */ +typedef enum aom_ext_part_status { + AOM_EXT_PART_OK = 0, /**< Status of success */ + AOM_EXT_PART_ERROR = 1, /**< Status of failure */ + AOM_EXT_PART_TEST = 2, /**< Status used for tests */ +} aom_ext_part_status_t; + +/*!\brief Callback of creating an external partition model. + * + * The callback is invoked by the encoder to create an external partition + * model. + * + * \param[in] priv Callback's private data + * \param[in] part_config Config information pointer for model creation + * \param[out] ext_part_model Pointer to the model + */ +typedef aom_ext_part_status_t (*aom_ext_part_create_model_fn_t)( + void *priv, const aom_ext_part_config_t *part_config, + aom_ext_part_model_t *ext_part_model); + +/*!\brief Callback of sending features to the external partition model. + * + * The callback is invoked by the encoder to send features to the external + * partition model. + * + * \param[in] ext_part_model The external model + * \param[in] part_features Pointer to the features + */ +typedef aom_ext_part_status_t (*aom_ext_part_send_features_fn_t)( + aom_ext_part_model_t ext_part_model, + const aom_partition_features_t *part_features); + +/*!\brief Callback of receiving partition decisions from the external + * partition model. + * + * The callback is invoked by the encoder to receive partition decisions from + * the external partition model. + * + * \param[in] ext_part_model The external model + * \param[in] ext_part_decision Pointer to the partition decisions + */ +typedef aom_ext_part_status_t (*aom_ext_part_get_decision_fn_t)( + aom_ext_part_model_t ext_part_model, + aom_partition_decision_t *ext_part_decision); + +/*!\brief Callback of sending stats to the external partition model. + * + * The callback is invoked by the encoder to send encoding stats to + * the external partition model. 
+ * + * \param[in] ext_part_model The external model + * \param[in] ext_part_stats Pointer to the encoding stats + */ +typedef aom_ext_part_status_t (*aom_ext_part_send_partition_stats_fn_t)( + aom_ext_part_model_t ext_part_model, + const aom_partition_stats_t *ext_part_stats); + +/*!\brief Callback of deleting the external partition model. + * + * The callback is invoked by the encoder to delete the external partition + * model. + * + * \param[in] ext_part_model The external model + */ +typedef aom_ext_part_status_t (*aom_ext_part_delete_model_fn_t)( + aom_ext_part_model_t ext_part_model); + +/*!\brief Callback function set for external partition model. + * + * Uses can enable external partition model by registering a set of + * callback functions with the flag: AV1E_SET_EXTERNAL_PARTITION_MODEL + */ +typedef struct aom_ext_part_funcs { + /*! + * Create an external partition model. + */ + aom_ext_part_create_model_fn_t create_model; + + /*! + * Send features to the external partition model to make partition decisions. + */ + aom_ext_part_send_features_fn_t send_features; + + /*! + * Get partition decisions from the external partition model. + */ + aom_ext_part_get_decision_fn_t get_partition_decision; + + /*! + * Send stats of the current partition to the external model. + */ + aom_ext_part_send_partition_stats_fn_t send_partition_stats; + + /*! + * Delete the external partition model. + */ + aom_ext_part_delete_model_fn_t delete_model; + + /*! + * Private data for the external partition model. 
+ */ + void *priv; +} aom_ext_part_funcs_t; + +/*!@} - end defgroup aom_encoder*/ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_AOM_EXTERNAL_PARTITION_H_ diff --git a/third_party/libaom/source/libaom/aom/aomcx.h b/third_party/libaom/source/libaom/aom/aomcx.h index 87f0b5db9b..8345911abd 100644 --- a/third_party/libaom/source/libaom/aom/aomcx.h +++ b/third_party/libaom/source/libaom/aom/aomcx.h @@ -18,6 +18,7 @@ */ #include "aom/aom.h" #include "aom/aom_encoder.h" +#include "aom/aom_external_partition.h" /*!\file * \brief Provides definitions for using AOM or AV1 encoder algorithm within the @@ -167,6 +168,7 @@ extern aom_codec_iface_t *aom_codec_av1_cx(void); * * This set of macros define the control functions available for AVx * encoder interface. + * The range of encode control ID is 7-229(max). * * \sa #aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...) */ @@ -221,10 +223,14 @@ enum aome_enc_control_id { /* NOTE: enum 15 unused */ - /*!\brief Codec control function to set loop filter sharpness, + /*!\brief Codec control function to set the sharpness parameter, * unsigned int parameter. * - * Valid range: 0..7. The default is 0. + * This parameter controls the level at which rate-distortion optimization of + * transform coefficients favours sharpness in the block. + * + * Valid range: 0..7. The default is 0. Values 1-7 will avoid eob and skip + * block optimization and will change rdmult in favour of block sharpness. 
*/ AOME_SET_SHARPNESS = AOME_SET_ENABLEAUTOALTREF + 2, // 16 @@ -1204,9 +1210,6 @@ enum aome_enc_control_id { parameter */ AV1E_SET_REDUCED_REFERENCE_SET = 125, - /* NOTE: enums 126-139 unused */ - /* NOTE: Need a gap in enum values to avoud conflict with 128, 129, 130 */ - /*!\brief Control to set frequency of the cost updates for coefficients, * unsigned int parameter * @@ -1215,7 +1218,7 @@ enum aome_enc_control_id { * - 2 = update at tile level * - 3 = turn off */ - AV1E_SET_COEFF_COST_UPD_FREQ = 140, + AV1E_SET_COEFF_COST_UPD_FREQ = 126, /*!\brief Control to set frequency of the cost updates for mode, unsigned int * parameter @@ -1225,7 +1228,7 @@ enum aome_enc_control_id { * - 2 = update at tile level * - 3 = turn off */ - AV1E_SET_MODE_COST_UPD_FREQ = 141, + AV1E_SET_MODE_COST_UPD_FREQ = 127, /*!\brief Control to set frequency of the cost updates for motion vectors, * unsigned int parameter @@ -1235,7 +1238,7 @@ enum aome_enc_control_id { * - 2 = update at tile level * - 3 = turn off */ - AV1E_SET_MV_COST_UPD_FREQ = 142, + AV1E_SET_MV_COST_UPD_FREQ = 128, /*!\brief Control to set bit mask that specifies which tier each of the 32 * possible operating points conforms to, unsigned int parameter @@ -1243,37 +1246,37 @@ enum aome_enc_control_id { * - 0 = main tier (default) * - 1 = high tier */ - AV1E_SET_TIER_MASK = 143, + AV1E_SET_TIER_MASK = 129, /*!\brief Control to set minimum compression ratio, unsigned int parameter * Take integer values. If non-zero, encoder will try to keep the compression * ratio of each frame to be higher than the given value divided by 100. * E.g. 850 means minimum compression ratio of 8.5. 
*/ - AV1E_SET_MIN_CR = 144, + AV1E_SET_MIN_CR = 130, /* NOTE: enums 145-149 unused */ /*!\brief Codec control function to set the layer id, aom_svc_layer_id_t* * parameter */ - AV1E_SET_SVC_LAYER_ID = 150, + AV1E_SET_SVC_LAYER_ID = 131, /*!\brief Codec control function to set SVC paramaeters, aom_svc_params_t* * parameter */ - AV1E_SET_SVC_PARAMS = 151, + AV1E_SET_SVC_PARAMS = 132, /*!\brief Codec control function to set reference frame config: * the ref_idx and the refresh flags for each buffer slot. * aom_svc_ref_frame_config_t* parameter */ - AV1E_SET_SVC_REF_FRAME_CONFIG = 152, + AV1E_SET_SVC_REF_FRAME_CONFIG = 133, /*!\brief Codec control function to set the path to the VMAF model used when * tuning the encoder for VMAF, const char* parameter */ - AV1E_SET_VMAF_MODEL_PATH = 153, + AV1E_SET_VMAF_MODEL_PATH = 134, /*!\brief Codec control function to enable EXT_TILE_DEBUG in AV1 encoder, * unsigned int parameter @@ -1283,7 +1286,7 @@ enum aome_enc_control_id { * * \note This is only used in lightfield example test. */ - AV1E_ENABLE_EXT_TILE_DEBUG = 154, + AV1E_ENABLE_EXT_TILE_DEBUG = 135, /*!\brief Codec control function to enable the superblock multipass unit test * in AV1 to ensure that the encoder does not leak state between different @@ -1294,30 +1297,30 @@ enum aome_enc_control_id { * * \note This is only used in sb_multipass unit test. 
*/ - AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST = 155, + AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST = 136, /*!\brief Control to select minimum height for the GF group pyramid structure, * unsigned int parameter * * Valid values: 0..5 */ - AV1E_SET_GF_MIN_PYRAMID_HEIGHT = 156, + AV1E_SET_GF_MIN_PYRAMID_HEIGHT = 137, /*!\brief Control to set average complexity of the corpus in the case of * single pass vbr based on LAP, unsigned int parameter */ - AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP = 157, + AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP = 138, /*!\brief Control to get baseline gf interval */ - AV1E_GET_BASELINE_GF_INTERVAL = 158, + AV1E_GET_BASELINE_GF_INTERVAL = 139, /*\brief Control to set encoding the denoised frame from denoise-noise-level * * - 0 = disabled/encode the original frame * - 1 = enabled/encode the denoised frame (default) */ - AV1E_SET_ENABLE_DNL_DENOISING = 159, + AV1E_SET_ENABLE_DNL_DENOISING = 140, /*!\brief Codec control function to turn on / off D45 to D203 intra mode * usage, int parameter @@ -1327,7 +1330,32 @@ enum aome_enc_control_id { * - 0 = disable * - 1 = enable (default) */ - AV1E_SET_ENABLE_DIAGONAL_INTRA = 160, + AV1E_SET_ENABLE_DIAGONAL_INTRA = 141, + + /*!\brief Control to set frequency of the cost updates for intrabc motion + * vectors, unsigned int parameter + * + * - 0 = update at SB level (default) + * - 1 = update at SB row level in tile + * - 2 = update at tile level + * - 3 = turn off + */ + AV1E_SET_DV_COST_UPD_FREQ = 142, + + /*!\brief Codec control to set the path for partition stats read and write. + * const char * parameter. + */ + AV1E_SET_PARTITION_INFO_PATH = 143, + + /*!\brief Codec control to use an external partition model + * A set of callback functions is passed through this control + * to let the encoder encode with given partitions. + */ + AV1E_SET_EXTERNAL_PARTITION = 144, + + // Any new encoder control IDs should be added above. + // Maximum allowed encoder control ID is 229. + // No encoder control ID should be added below. 
}; /*!\brief aom 1-D scaling mode @@ -1858,6 +1886,15 @@ AOM_CTRL_USE_TYPE(AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP, unsigned int) AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DNL_DENOISING, int) #define AOM_CTRL_AV1E_SET_ENABLE_DNL_DENOISING +AOM_CTRL_USE_TYPE(AV1E_SET_DV_COST_UPD_FREQ, unsigned int) +#define AOM_CTRL_AV1E_SET_DV_COST_UPD_FREQ + +AOM_CTRL_USE_TYPE(AV1E_SET_PARTITION_INFO_PATH, const char *) +#define AOM_CTRL_AV1E_SET_PARTITION_INFO_PATH + +AOM_CTRL_USE_TYPE(AV1E_SET_EXTERNAL_PARTITION, aom_ext_part_funcs_t *) +#define AOM_CTRL_AV1E_SET_ENABLE_DNL_DENOISING + /*!\endcond */ /*! @} - end defgroup aom_encoder */ #ifdef __cplusplus diff --git a/third_party/libaom/source/libaom/aom/aomdx.h b/third_party/libaom/source/libaom/aom/aomdx.h index aa4f435ec4..b3fd90e460 100644 --- a/third_party/libaom/source/libaom/aom/aomdx.h +++ b/third_party/libaom/source/libaom/aom/aomdx.h @@ -188,6 +188,7 @@ typedef struct av1_ext_ref_frame { * * This set of macros define the control functions available for the AOM * decoder interface. + * The range for decoder control ID is >= 256. * * \sa #aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...) 
*/ @@ -381,8 +382,6 @@ enum aom_dec_control_id { */ AV1D_SET_SKIP_FILM_GRAIN, - AOM_DECODER_CTRL_ID_MAX, - /*!\brief Codec control function to check the presence of forward key frames */ AOMD_GET_FWD_KF_PRESENT, diff --git a/third_party/libaom/source/libaom/aom/internal/aom_codec_internal.h b/third_party/libaom/source/libaom/aom/internal/aom_codec_internal.h index 0ad33bdf2e..457da9244a 100644 --- a/third_party/libaom/source/libaom/aom/internal/aom_codec_internal.h +++ b/third_party/libaom/source/libaom/aom/internal/aom_codec_internal.h @@ -278,7 +278,7 @@ typedef aom_fixed_buf_t *(*aom_codec_get_global_headers_fn_t)( typedef aom_image_t *(*aom_codec_get_preview_frame_fn_t)( aom_codec_alg_priv_t *ctx); -/*!\brief Decoder algorithm interface interface +/*!\brief Decoder algorithm interface * * All decoders \ref MUST expose a variable of this type. */ diff --git a/third_party/libaom/source/libaom/aom/src/aom_image.c b/third_party/libaom/source/libaom/aom/src/aom_image.c index dfdee87d26..13f71b2bf5 100644 --- a/third_party/libaom/source/libaom/aom/src/aom_image.c +++ b/third_party/libaom/source/libaom/aom/src/aom_image.c @@ -38,6 +38,8 @@ static aom_image_t *img_alloc_helper( unsigned int h, w, s, xcs, ycs, bps, bit_depth; unsigned int stride_in_bytes; + if (img != NULL) memset(img, 0, sizeof(aom_image_t)); + /* Treat align==0 like align==1 */ if (!buf_align) buf_align = 1; @@ -111,8 +113,6 @@ static aom_image_t *img_alloc_helper( if (!img) goto fail; img->self_allocd = 1; - } else { - memset(img, 0, sizeof(aom_image_t)); } img->img_data = img_data; diff --git a/third_party/libaom/source/libaom/aom_dsp/arm/intrapred_neon.c b/third_party/libaom/source/libaom/aom_dsp/arm/intrapred_neon.c index 6d41708ee0..945e7e48ee 100644 --- a/third_party/libaom/source/libaom/aom_dsp/arm/intrapred_neon.c +++ b/third_party/libaom/source/libaom/aom_dsp/arm/intrapred_neon.c @@ -11,8 +11,6 @@ #include <arm_neon.h> -#include "common/tools_common.h" - #include "config/aom_config.h" 
#include "config/aom_dsp_rtcd.h" diff --git a/third_party/libaom/source/libaom/aom_dsp/butteraugli.c b/third_party/libaom/source/libaom/aom_dsp/butteraugli.c index 7ce2324c06..038efcd313 100644 --- a/third_party/libaom/source/libaom/aom_dsp/butteraugli.c +++ b/third_party/libaom/source/libaom/aom_dsp/butteraugli.c @@ -18,37 +18,71 @@ int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *distorted, int bit_depth, - float *dist_map) { + aom_matrix_coefficients_t matrix_coefficients, + aom_color_range_t color_range, float *dist_map) { (void)bit_depth; assert(bit_depth == 8); - assert(source->y_width == source->uv_width * 2); const int width = source->y_crop_width; const int height = source->y_crop_height; + const int ss_x = source->subsampling_x; + const int ss_y = source->subsampling_y; - size_t buffer_size = width * height * 3; - uint8_t *src_rgb = (uint8_t *)aom_malloc(buffer_size); - uint8_t *distorted_rgb = (uint8_t *)aom_malloc(buffer_size); - if (!src_rgb || !distorted_rgb) { - aom_free(src_rgb); - aom_free(distorted_rgb); + const struct YuvConstants *yuv_constants; + if (matrix_coefficients == AOM_CICP_MC_BT_709) { + if (color_range == AOM_CR_FULL_RANGE) return 0; + yuv_constants = &kYuvH709Constants; + } else { + yuv_constants = color_range == AOM_CR_FULL_RANGE ? 
&kYuvJPEGConstants + : &kYuvI601Constants; + } + + const size_t stride_argb = width * 4; + const size_t buffer_size = height * stride_argb; + uint8_t *src_argb = (uint8_t *)aom_malloc(buffer_size); + uint8_t *distorted_argb = (uint8_t *)aom_malloc(buffer_size); + if (!src_argb || !distorted_argb) { + aom_free(src_argb); + aom_free(distorted_argb); return 0; } - I420ToRGB24Matrix(source->y_buffer, source->y_stride, source->u_buffer, - source->uv_stride, source->v_buffer, source->uv_stride, - src_rgb, width * 3, &kYuvH709Constants, width, height); - I420ToRGB24Matrix(distorted->y_buffer, distorted->y_stride, - distorted->u_buffer, distorted->uv_stride, - distorted->v_buffer, distorted->uv_stride, distorted_rgb, - width * 3, &kYuvH709Constants, width, height); + if (ss_x == 1 && ss_y == 1) { + I420ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer, + source->uv_stride, source->v_buffer, source->uv_stride, + src_argb, stride_argb, yuv_constants, width, height); + I420ToARGBMatrix(distorted->y_buffer, distorted->y_stride, + distorted->u_buffer, distorted->uv_stride, + distorted->v_buffer, distorted->uv_stride, distorted_argb, + stride_argb, yuv_constants, width, height); + } else if (ss_x == 1 && ss_y == 0) { + I422ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer, + source->uv_stride, source->v_buffer, source->uv_stride, + src_argb, stride_argb, yuv_constants, width, height); + I422ToARGBMatrix(distorted->y_buffer, distorted->y_stride, + distorted->u_buffer, distorted->uv_stride, + distorted->v_buffer, distorted->uv_stride, distorted_argb, + stride_argb, yuv_constants, width, height); + } else if (ss_x == 0 && ss_y == 0) { + I444ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer, + source->uv_stride, source->v_buffer, source->uv_stride, + src_argb, stride_argb, yuv_constants, width, height); + I444ToARGBMatrix(distorted->y_buffer, distorted->y_stride, + distorted->u_buffer, distorted->uv_stride, + distorted->v_buffer, 
distorted->uv_stride, distorted_argb, + stride_argb, yuv_constants, width, height); + } else { + aom_free(src_argb); + aom_free(distorted_argb); + return 0; + } - JxlPixelFormat pixel_format = { 3, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0 }; + JxlPixelFormat pixel_format = { 4, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0 }; JxlButteraugliApi *api = JxlButteraugliApiCreate(NULL); JxlButteraugliApiSetHFAsymmetry(api, 0.8f); JxlButteraugliResult *result = JxlButteraugliCompute( - api, width, height, &pixel_format, src_rgb, buffer_size, &pixel_format, - distorted_rgb, buffer_size); + api, width, height, &pixel_format, src_argb, buffer_size, &pixel_format, + distorted_argb, buffer_size); const float *distmap = NULL; uint32_t row_stride; @@ -56,8 +90,8 @@ int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source, if (distmap == NULL) { JxlButteraugliApiDestroy(api); JxlButteraugliResultDestroy(result); - aom_free(src_rgb); - aom_free(distorted_rgb); + aom_free(src_argb); + aom_free(distorted_argb); return 0; } @@ -69,7 +103,7 @@ int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source, JxlButteraugliApiDestroy(api); JxlButteraugliResultDestroy(result); - aom_free(src_rgb); - aom_free(distorted_rgb); + aom_free(src_argb); + aom_free(distorted_argb); return 1; } diff --git a/third_party/libaom/source/libaom/aom_dsp/butteraugli.h b/third_party/libaom/source/libaom/aom_dsp/butteraugli.h index 06402aa3e4..5304092ccb 100644 --- a/third_party/libaom/source/libaom/aom_dsp/butteraugli.h +++ b/third_party/libaom/source/libaom/aom_dsp/butteraugli.h @@ -14,8 +14,10 @@ #include "aom_scale/yv12config.h" +// Returns a boolean that indicates success/failure. 
int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *distorted, int bit_depth, - float *dist_map); + aom_matrix_coefficients_t matrix_coefficients, + aom_color_range_t color_range, float *dist_map); #endif // AOM_AOM_DSP_BUTTERAUGLI_H_ diff --git a/third_party/libaom/source/libaom/aom_dsp/fastssim.c b/third_party/libaom/source/libaom/aom_dsp/fastssim.c index 3804519b31..89712c5f40 100644 --- a/third_party/libaom/source/libaom/aom_dsp/fastssim.c +++ b/third_party/libaom/source/libaom/aom_dsp/fastssim.c @@ -31,6 +31,7 @@ typedef struct fs_ctx fs_ctx; #define SSIM_C1_12 (4095 * 4095 * 0.01 * 0.01) #define SSIM_C2_10 (1023 * 1023 * 0.03 * 0.03) #define SSIM_C2_12 (4095 * 4095 * 0.03 * 0.03) +#define MAX_SSIM_DB 100.0 #define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b)) #define FS_MAXI(_a, _b) ((_a) > (_b) ? (_a) : (_b)) diff --git a/third_party/libaom/source/libaom/aom_dsp/grain_table.c b/third_party/libaom/source/libaom/aom_dsp/grain_table.c index e03f04d5da..b22752abd9 100644 --- a/third_party/libaom/source/libaom/aom_dsp/grain_table.c +++ b/third_party/libaom/source/libaom/aom_dsp/grain_table.c @@ -202,7 +202,7 @@ int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp, int64_t end_time, int erase, aom_film_grain_t *grain) { aom_film_grain_table_entry_t *entry = t->head; - aom_film_grain_table_entry_t *prev_entry = 0; + aom_film_grain_table_entry_t *prev_entry = NULL; uint16_t random_seed = grain ? 
grain->random_seed : 0; if (grain) memset(grain, 0, sizeof(*grain)); @@ -241,10 +241,10 @@ int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp, entry->end_time = time_stamp; if (t->tail == entry) t->tail = new_entry; } - // If segments aren't aligned, delete from the beggining of subsequent + // If segments aren't aligned, delete from the beginning of subsequent // segments if (end_time > entry_end_time) { - aom_film_grain_table_lookup(t, entry->end_time, end_time, 1, 0); + aom_film_grain_table_lookup(t, entry_end_time, end_time, 1, 0); } return 1; } @@ -275,12 +275,12 @@ aom_codec_err_t aom_film_grain_table_read( return error_info->error_code; } - aom_film_grain_table_entry_t *prev_entry = 0; + aom_film_grain_table_entry_t *prev_entry = NULL; while (!feof(file)) { aom_film_grain_table_entry_t *entry = aom_malloc(sizeof(*entry)); memset(entry, 0, sizeof(*entry)); grain_table_entry_read(file, error_info, entry); - entry->next = 0; + entry->next = NULL; if (prev_entry) prev_entry->next = entry; if (!t->head) t->head = entry; diff --git a/third_party/libaom/source/libaom/aom_dsp/noise_model.c b/third_party/libaom/source/libaom/aom_dsp/noise_model.c index f56fdd5860..19c660e911 100644 --- a/third_party/libaom/source/libaom/aom_dsp/noise_model.c +++ b/third_party/libaom/source/libaom/aom_dsp/noise_model.c @@ -214,6 +214,7 @@ static void set_chroma_coefficient_fallback_soln(aom_equation_system_t *eqns) { int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points) { if (!lut) return 0; + if (num_points <= 0) return 0; lut->num_points = 0; lut->points = (double(*)[2])aom_malloc(num_points * sizeof(*lut->points)); if (!lut->points) return 0; @@ -1152,12 +1153,24 @@ int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model, // Convert the scaling functions to 8 bit values aom_noise_strength_lut_t scaling_points[3]; - aom_noise_strength_solver_fit_piecewise( - &noise_model->combined_state[0].strength_solver, 14, 
scaling_points + 0); - aom_noise_strength_solver_fit_piecewise( - &noise_model->combined_state[1].strength_solver, 10, scaling_points + 1); - aom_noise_strength_solver_fit_piecewise( - &noise_model->combined_state[2].strength_solver, 10, scaling_points + 2); + if (!aom_noise_strength_solver_fit_piecewise( + &noise_model->combined_state[0].strength_solver, 14, + scaling_points + 0)) { + return 0; + } + if (!aom_noise_strength_solver_fit_piecewise( + &noise_model->combined_state[1].strength_solver, 10, + scaling_points + 1)) { + aom_noise_strength_lut_free(scaling_points + 0); + return 0; + } + if (!aom_noise_strength_solver_fit_piecewise( + &noise_model->combined_state[2].strength_solver, 10, + scaling_points + 2)) { + aom_noise_strength_lut_free(scaling_points + 0); + aom_noise_strength_lut_free(scaling_points + 1); + return 0; + } // Both the domain and the range of the scaling functions in the film_grain // are normalized to 8-bit (e.g., they are implicitly scaled during grain diff --git a/third_party/libaom/source/libaom/aom_dsp/psnrhvs.c b/third_party/libaom/source/libaom/aom_dsp/psnrhvs.c index 69a1d99bf2..25f075aa2f 100644 --- a/third_party/libaom/source/libaom/aom_dsp/psnrhvs.c +++ b/third_party/libaom/source/libaom/aom_dsp/psnrhvs.c @@ -34,6 +34,7 @@ static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3; } +#if CONFIG_AV1_HIGHBITDEPTH static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, int xstride) { int i, j; @@ -43,6 +44,7 @@ static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, for (j = 0; j < 8; j++) *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3; } +#endif // CONFIG_AV1_HIGHBITDEPTH /* Normalized inverse quantization matrix for 8x8 DCT at the point of * transparency. 
This is not the JPEG based matrix from the paper, @@ -210,6 +212,7 @@ static double calc_psnrhvs(const unsigned char *src, int _systride, } } s_gvar = 1.f / (36 - n + 1) * s_gmean / 36.f; +#if CONFIG_AV1_HIGHBITDEPTH if (!buf_is_hbd) { od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); @@ -217,6 +220,10 @@ static double calc_psnrhvs(const unsigned char *src, int _systride, hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); } +#else + od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); + od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); +#endif // CONFIG_AV1_HIGHBITDEPTH for (i = 0; i < 8; i++) for (j = (i == 0); j < 8; j++) s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j]; diff --git a/third_party/libaom/source/libaom/aom_dsp/ssim.c b/third_party/libaom/source/libaom/aom_dsp/ssim.c index 357da99ae4..c5334fd2c5 100644 --- a/third_party/libaom/source/libaom/aom_dsp/ssim.c +++ b/third_party/libaom/source/libaom/aom_dsp/ssim.c @@ -18,6 +18,7 @@ #include "aom_ports/mem.h" #include "aom_ports/system_state.h" +#if CONFIG_INTERNAL_STATS void aom_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, @@ -33,6 +34,7 @@ void aom_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp, } } } +#endif // CONFIG_INTERNAL_STATS void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, @@ -49,24 +51,6 @@ void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, } } -#if CONFIG_AV1_HIGHBITDEPTH -void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r, - int rp, uint32_t *sum_s, uint32_t *sum_r, - uint32_t *sum_sq_s, uint32_t *sum_sq_r, - uint32_t *sum_sxr) { - int i, j; - for (i = 0; i < 8; i++, s += sp, r += rp) { - for (j = 0; j < 8; j++) { - *sum_s += s[j]; - *sum_r += r[j]; - *sum_sq_s += s[j] 
* s[j]; - *sum_sq_r += r[j] * r[j]; - *sum_sxr += s[j] * r[j]; - } - } -} -#endif - static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 static const int64_t cc1_10 = 428658; // (64^2*(.01*1023)^2 @@ -78,7 +62,7 @@ static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, uint32_t sum_sq_r, uint32_t sum_sxr, int count, uint32_t bd) { double ssim_n, ssim_d; - int64_t c1, c2; + int64_t c1 = 0, c2 = 0; if (bd == 8) { // scale the constants by number of pixels c1 = (cc1 * count * count) >> 12; @@ -90,8 +74,9 @@ static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, c1 = (cc1_12 * count * count) >> 12; c2 = (cc2_12 * count * count) >> 12; } else { - c1 = c2 = 0; assert(0); + // Return similarity as zero for unsupported bit-depth values. + return 0; } ssim_n = (2.0 * sum_s * sum_r + c1) * @@ -111,21 +96,11 @@ static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) { return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8); } -static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r, - int rp, uint32_t bd, uint32_t shift) { - uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; - aom_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, - &sum_sxr); - return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift), - sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd); -} - // We are using a 8x8 moving window with starting location of each 8x8 window // on the 4x4 pixel grid. Such arrangement allows the windows to overlap // block boundaries to penalize blocking artifacts. 
-static double aom_ssim2(const uint8_t *img1, const uint8_t *img2, - int stride_img1, int stride_img2, int width, - int height) { +double aom_ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1, + int stride_img2, int width, int height) { int i, j; int samples = 0; double ssim_total = 0; @@ -143,31 +118,10 @@ static double aom_ssim2(const uint8_t *img1, const uint8_t *img2, return ssim_total; } -static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, - int stride_img1, int stride_img2, int width, - int height, uint32_t bd, uint32_t shift) { - int i, j; - int samples = 0; - double ssim_total = 0; - - // sample point start with each 4x4 location - for (i = 0; i <= height - 8; - i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { - for (j = 0; j <= width - 8; j += 4) { - double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1, - CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd, - shift); - ssim_total += v; - samples++; - } - } - ssim_total /= samples; - return ssim_total; -} - -void aom_calc_ssim(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, double *weight, - double *fast_ssim) { +#if CONFIG_INTERNAL_STATS +void aom_lowbd_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight, + double *fast_ssim) { double abc[3]; for (int i = 0; i < 3; ++i) { const int is_uv = i > 0; @@ -421,7 +375,57 @@ double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, m->dssim = dssim_total; return inconsistency_total; } +#endif // CONFIG_INTERNAL_STATS +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r, + int rp, uint32_t *sum_s, uint32_t *sum_r, + uint32_t *sum_sq_s, uint32_t *sum_sq_r, + uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 8; i++, s += sp, r += rp) { + for (j = 0; j < 8; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; 
+ } + } +} + +static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r, + int rp, uint32_t bd, uint32_t shift) { + uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; + aom_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, + &sum_sxr); + return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift), + sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd); +} + +double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, + int stride_img1, int stride_img2, int width, int height, + uint32_t bd, uint32_t shift) { + int i, j; + int samples = 0; + double ssim_total = 0; + + // sample point start with each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1, + CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd, + shift); + ssim_total += v; + samples++; + } + } + ssim_total /= samples; + return ssim_total; +} + +#if CONFIG_INTERNAL_STATS void aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *dest, double *weight, uint32_t bd, uint32_t in_bd, double *fast_ssim) { @@ -455,3 +459,25 @@ void aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, fast_ssim[1] = abc[0] * .8 + .1 * (abc[1] + abc[2]); } } +#endif // CONFIG_INTERNAL_STATS +#endif // CONFIG_AV1_HIGHBITDEPTH + +#if CONFIG_INTERNAL_STATS +void aom_calc_ssim(const YV12_BUFFER_CONFIG *orig, + const YV12_BUFFER_CONFIG *recon, const uint32_t bit_depth, + const uint32_t in_bit_depth, int is_hbd, double *weight, + double *frame_ssim2) { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_hbd) { + aom_highbd_calc_ssim(orig, recon, weight, bit_depth, in_bit_depth, + frame_ssim2); + return; + } +#else + (void)bit_depth; + (void)in_bit_depth; + (void)is_hbd; +#endif // CONFIG_AV1_HIGHBITDEPTH + aom_lowbd_calc_ssim(orig, recon, weight, frame_ssim2); +} +#endif // 
CONFIG_INTERNAL_STATS diff --git a/third_party/libaom/source/libaom/aom_dsp/ssim.h b/third_party/libaom/source/libaom/aom_dsp/ssim.h index d635ef5bbe..fb92556a8c 100644 --- a/third_party/libaom/source/libaom/aom_dsp/ssim.h +++ b/third_party/libaom/source/libaom/aom_dsp/ssim.h @@ -12,14 +12,13 @@ #ifndef AOM_AOM_DSP_SSIM_H_ #define AOM_AOM_DSP_SSIM_H_ -#define MAX_SSIM_DB 100.0; - #ifdef __cplusplus extern "C" { #endif #include "config/aom_config.h" +#if CONFIG_INTERNAL_STATS #include "aom_scale/yv12config.h" // metrics used for calculating ssim, ssim2, dssim, and ssimc @@ -68,18 +67,35 @@ double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, int img2_pitch, int width, int height, Ssimv *sv2, Metrics *m, int do_inconsistency); -void aom_calc_ssim(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, double *weight, - double *fast_ssim); +void aom_lowbd_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight, + double *fast_ssim); double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *dest, double *ssim_y, double *ssim_u, double *ssim_v, uint32_t bd, uint32_t in_bd); +#if CONFIG_AV1_HIGHBITDEPTH void aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *dest, double *weight, uint32_t bd, uint32_t in_bd, double *fast_ssim); +#endif // CONFIG_AV1_HIGHBITDEPTH + +void aom_calc_ssim(const YV12_BUFFER_CONFIG *orig, + const YV12_BUFFER_CONFIG *recon, const uint32_t bit_depth, + const uint32_t in_bit_depth, int is_hbd, double *weight, + double *frame_ssim2); +#endif // CONFIG_INTERNAL_STATS + +double aom_ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1, + int stride_img2, int width, int height); + +#if CONFIG_AV1_HIGHBITDEPTH +double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, + int stride_img1, int stride_img2, int width, int height, + uint32_t bd, uint32_t shift); +#endif // CONFIG_AV1_HIGHBITDEPTH #ifdef __cplusplus 
} // extern "C" diff --git a/third_party/libaom/source/libaom/aom_dsp/vmaf.c b/third_party/libaom/source/libaom/aom_dsp/vmaf.c index 41653430c1..219e278303 100644 --- a/third_party/libaom/source/libaom/aom_dsp/vmaf.c +++ b/third_party/libaom/source/libaom/aom_dsp/vmaf.c @@ -12,9 +12,6 @@ #include "aom_dsp/vmaf.h" #include <assert.h> -#if !CONFIG_USE_VMAF_RC -#include <libvmaf.h> -#endif #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -24,10 +21,7 @@ #include <unistd.h> #endif -#if CONFIG_USE_VMAF_RC -#include <libvmaf/libvmaf.rc.h> -#endif - +#include <libvmaf/libvmaf.h> #include "aom_dsp/blend.h" #include "aom_ports/system_state.h" @@ -36,162 +30,18 @@ static void vmaf_fatal_error(const char *message) { exit(EXIT_FAILURE); } -#if !CONFIG_USE_VMAF_RC -typedef struct FrameData { - const YV12_BUFFER_CONFIG *source; - const YV12_BUFFER_CONFIG *distorted; - int frame_set; - int bit_depth; -} FrameData; - -// A callback function used to pass data to VMAF. -// Returns 0 after reading a frame. -// Returns 2 when there is no more frame to read. 
-static int read_frame(float *ref_data, float *main_data, float *temp_data, - int stride, void *user_data) { - FrameData *frames = (FrameData *)user_data; - - if (!frames->frame_set) { - const int width = frames->source->y_width; - const int height = frames->source->y_height; - assert(width == frames->distorted->y_width); - assert(height == frames->distorted->y_height); - - if (frames->source->flags & YV12_FLAG_HIGHBITDEPTH) { - const float scale_factor = 1.0f / (float)(1 << (frames->bit_depth - 8)); - uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(frames->source->y_buffer); - uint16_t *main_ptr = CONVERT_TO_SHORTPTR(frames->distorted->y_buffer); - - for (int row = 0; row < height; ++row) { - for (int col = 0; col < width; ++col) { - ref_data[col] = scale_factor * (float)ref_ptr[col]; - } - ref_ptr += frames->source->y_stride; - ref_data += stride / sizeof(*ref_data); - } - - for (int row = 0; row < height; ++row) { - for (int col = 0; col < width; ++col) { - main_data[col] = scale_factor * (float)main_ptr[col]; - } - main_ptr += frames->distorted->y_stride; - main_data += stride / sizeof(*main_data); - } - } else { - uint8_t *ref_ptr = frames->source->y_buffer; - uint8_t *main_ptr = frames->distorted->y_buffer; - - for (int row = 0; row < height; ++row) { - for (int col = 0; col < width; ++col) { - ref_data[col] = (float)ref_ptr[col]; - } - ref_ptr += frames->source->y_stride; - ref_data += stride / sizeof(*ref_data); - } - - for (int row = 0; row < height; ++row) { - for (int col = 0; col < width; ++col) { - main_data[col] = (float)main_ptr[col]; - } - main_ptr += frames->distorted->y_stride; - main_data += stride / sizeof(*main_data); - } - } - frames->frame_set = 1; - return 0; - } - - (void)temp_data; - return 2; -} - -void aom_calc_vmaf(const char *model_path, const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *distorted, const int bit_depth, - double *const vmaf) { - aom_clear_system_state(); - const int width = source->y_width; - const int height = 
source->y_height; - FrameData frames = { source, distorted, 0, bit_depth }; - char *fmt = bit_depth == 10 ? "yuv420p10le" : "yuv420p"; - double vmaf_score; - const int ret = - compute_vmaf(&vmaf_score, fmt, width, height, read_frame, - /*user_data=*/&frames, (char *)model_path, - /*log_path=*/NULL, /*log_fmt=*/NULL, /*disable_clip=*/1, - /*disable_avx=*/0, /*enable_transform=*/0, - /*phone_model=*/0, /*do_psnr=*/0, /*do_ssim=*/0, - /*do_ms_ssim=*/0, /*pool_method=*/NULL, /*n_thread=*/0, - /*n_subsample=*/1, /*enable_conf_interval=*/0); - if (ret) vmaf_fatal_error("Failed to compute VMAF scores."); - - aom_clear_system_state(); - *vmaf = vmaf_score; -} - -void aom_calc_vmaf_multi_frame(void *user_data, const char *model_path, - int (*rd_frm)(float *ref_data, float *main_data, - float *temp_data, int stride_byte, - void *user_data), - int frame_width, int frame_height, int bit_depth, - double *vmaf) { - aom_clear_system_state(); - - char *fmt = bit_depth == 10 ? "yuv420p10le" : "yuv420p"; - int log_path_length = snprintf(NULL, 0, "vmaf_scores_%d.xml", getpid()) + 1; - char *log_path = malloc(log_path_length); - snprintf(log_path, log_path_length, "vmaf_scores_%d.xml", getpid()); - double vmaf_score; - const int ret = - compute_vmaf(&vmaf_score, fmt, frame_width, frame_height, rd_frm, - /*user_data=*/user_data, (char *)model_path, - /*log_path=*/log_path, /*log_fmt=*/NULL, /*disable_clip=*/0, - /*disable_avx=*/0, /*enable_transform=*/0, - /*phone_model=*/0, /*do_psnr=*/0, /*do_ssim=*/0, - /*do_ms_ssim=*/0, /*pool_method=*/NULL, /*n_thread=*/0, - /*n_subsample=*/1, /*enable_conf_interval=*/0); - FILE *vmaf_log = fopen(log_path, "r"); - free(log_path); - log_path = NULL; - if (vmaf_log == NULL || ret) { - vmaf_fatal_error("Failed to compute VMAF scores."); - } - - int frame_index = 0; - char buf[512]; - while (fgets(buf, 511, vmaf_log) != NULL) { - if (memcmp(buf, "\t\t<frame ", 9) == 0) { - char *p = strstr(buf, "vmaf="); - if (p != NULL && p[5] == '"') { - char *p2 = 
strstr(&p[6], "\""); - *p2 = '\0'; - const double score = atof(&p[6]); - if (score < 0.0 || score > 100.0) { - vmaf_fatal_error("Failed to compute VMAF scores."); - } - vmaf[frame_index++] = score; - } - } - } - fclose(vmaf_log); - - aom_clear_system_state(); -} -#endif - -#if CONFIG_USE_VMAF_RC -void aom_init_vmaf_model_rc(VmafModel **vmaf_model, const char *model_path) { +void aom_init_vmaf_model(VmafModel **vmaf_model, const char *model_path) { if (*vmaf_model != NULL) return; VmafModelConfig model_cfg; model_cfg.flags = VMAF_MODEL_FLAG_DISABLE_CLIP; model_cfg.name = "vmaf"; - model_cfg.path = (char *)model_path; - if (vmaf_model_load_from_path(vmaf_model, &model_cfg)) { + if (vmaf_model_load_from_path(vmaf_model, &model_cfg, model_path)) { vmaf_fatal_error("Failed to load VMAF model."); } } -void aom_close_vmaf_model_rc(VmafModel *vmaf_model) { +void aom_close_vmaf_model(VmafModel *vmaf_model) { vmaf_model_destroy(vmaf_model); } @@ -221,8 +71,9 @@ static void copy_picture(const int bit_depth, const YV12_BUFFER_CONFIG *src, } } -void aom_init_vmaf_context_rc(VmafContext **vmaf_context, VmafModel *vmaf_model, - bool cal_vmaf_neg) { +void aom_init_vmaf_context(VmafContext **vmaf_context, VmafModel *vmaf_model, + bool cal_vmaf_neg) { + // TODO(sdeng): make them CLI arguments. 
VmafConfiguration cfg; cfg.log_level = VMAF_LOG_LEVEL_NONE; cfg.n_threads = 0; @@ -233,41 +84,53 @@ void aom_init_vmaf_context_rc(VmafContext **vmaf_context, VmafModel *vmaf_model, vmaf_fatal_error("Failed to init VMAF context."); } - if (vmaf_use_features_from_model(*vmaf_context, vmaf_model)) { - vmaf_fatal_error("Failed to load feature extractors from VMAF model."); - } - if (cal_vmaf_neg) { VmafFeatureDictionary *vif_feature = NULL; - vmaf_feature_dictionary_set(&vif_feature, "vif_enhn_gain_limit", "1.0"); - if (vmaf_use_feature(*vmaf_context, "float_vif", vif_feature)) { + if (vmaf_feature_dictionary_set(&vif_feature, "vif_enhn_gain_limit", + "1.0")) { + vmaf_fatal_error("Failed to set vif_enhn_gain_limit."); + } + if (vmaf_model_feature_overload(vmaf_model, "float_vif", vif_feature)) { vmaf_fatal_error("Failed to use feature float_vif."); } VmafFeatureDictionary *adm_feature = NULL; - vmaf_feature_dictionary_set(&adm_feature, "adm_enhn_gain_limit", "1.0"); - if (vmaf_use_feature(*vmaf_context, "float_adm", adm_feature)) { + if (vmaf_feature_dictionary_set(&adm_feature, "adm_enhn_gain_limit", + "1.0")) { + vmaf_fatal_error("Failed to set adm_enhn_gain_limit."); + } + if (vmaf_model_feature_overload(vmaf_model, "adm", adm_feature)) { vmaf_fatal_error("Failed to use feature float_adm."); } } VmafFeatureDictionary *motion_force_zero = NULL; - vmaf_feature_dictionary_set(&motion_force_zero, "motion_force_zero", "true"); - if (vmaf_use_feature(*vmaf_context, "float_motion", motion_force_zero)) { + if (vmaf_feature_dictionary_set(&motion_force_zero, "motion_force_zero", + "1")) { + vmaf_fatal_error("Failed to set motion_force_zero."); + } + if (vmaf_model_feature_overload(vmaf_model, "float_motion", + motion_force_zero)) { vmaf_fatal_error("Failed to use feature float_motion."); } + + if (vmaf_use_features_from_model(*vmaf_context, vmaf_model)) { + vmaf_fatal_error("Failed to load feature extractors from VMAF model."); + } } -void 
aom_close_vmaf_context_rc(VmafContext *vmaf_context) { +void aom_close_vmaf_context(VmafContext *vmaf_context) { if (vmaf_close(vmaf_context)) { vmaf_fatal_error("Failed to close VMAF context."); } } -void aom_calc_vmaf_at_index_rc(VmafContext *vmaf_context, VmafModel *vmaf_model, - const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *distorted, - int bit_depth, int frame_index, double *vmaf) { +void aom_calc_vmaf(VmafModel *vmaf_model, const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *distorted, int bit_depth, + bool cal_vmaf_neg, double *vmaf) { + VmafContext *vmaf_context; + aom_init_vmaf_context(&vmaf_context, vmaf_model, cal_vmaf_neg); + const int frame_index = 0; VmafPicture ref, dist; if (vmaf_picture_alloc(&ref, VMAF_PIX_FMT_YUV420P, bit_depth, source->y_width, source->y_height) || @@ -282,10 +145,50 @@ void aom_calc_vmaf_at_index_rc(VmafContext *vmaf_context, VmafModel *vmaf_model, vmaf_fatal_error("Failed to read VMAF pictures."); } + if (vmaf_read_pictures(vmaf_context, NULL, NULL, 0)) { + vmaf_fatal_error("Failed to flush context."); + } + vmaf_picture_unref(&ref); vmaf_picture_unref(&dist); vmaf_score_at_index(vmaf_context, vmaf_model, vmaf, frame_index); + aom_close_vmaf_context(vmaf_context); } -#endif // CONFIG_USE_VMAF_RC +void aom_read_vmaf_image(VmafContext *vmaf_context, + const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *distorted, int bit_depth, + int frame_index) { + VmafPicture ref, dist; + if (vmaf_picture_alloc(&ref, VMAF_PIX_FMT_YUV420P, bit_depth, source->y_width, + source->y_height) || + vmaf_picture_alloc(&dist, VMAF_PIX_FMT_YUV420P, bit_depth, + source->y_width, source->y_height)) { + vmaf_fatal_error("Failed to alloc VMAF pictures."); + } + copy_picture(bit_depth, source, &ref); + copy_picture(bit_depth, distorted, &dist); + if (vmaf_read_pictures(vmaf_context, &ref, &dist, + /*picture index=*/frame_index)) { + vmaf_fatal_error("Failed to read VMAF pictures."); + } + + vmaf_picture_unref(&ref); + 
vmaf_picture_unref(&dist); +} + +double aom_calc_vmaf_at_index(VmafContext *vmaf_context, VmafModel *vmaf_model, + int frame_index) { + double vmaf; + if (vmaf_score_at_index(vmaf_context, vmaf_model, &vmaf, frame_index)) { + vmaf_fatal_error("Failed to calc VMAF scores."); + } + return vmaf; +} + +void aom_flush_vmaf_context(VmafContext *vmaf_context) { + if (vmaf_read_pictures(vmaf_context, NULL, NULL, 0)) { + vmaf_fatal_error("Failed to flush context."); + } +} diff --git a/third_party/libaom/source/libaom/aom_dsp/vmaf.h b/third_party/libaom/source/libaom/aom_dsp/vmaf.h index d9da223e29..3ba8c8d565 100644 --- a/third_party/libaom/source/libaom/aom_dsp/vmaf.h +++ b/third_party/libaom/source/libaom/aom_dsp/vmaf.h @@ -15,33 +15,28 @@ #include <stdbool.h> #include "aom_scale/yv12config.h" -#if CONFIG_USE_VMAF_RC typedef struct VmafContext VmafContext; typedef struct VmafModel VmafModel; -#endif - -#if CONFIG_USE_VMAF_RC -void aom_init_vmaf_context_rc(VmafContext **vmaf_context, VmafModel *vmaf_model, - bool cal_vmaf_neg); -void aom_close_vmaf_context_rc(VmafContext *vmaf_context); - -void aom_init_vmaf_model_rc(VmafModel **vmaf_model, const char *model_path); -void aom_close_vmaf_model_rc(VmafModel *vmaf_model); - -void aom_calc_vmaf_at_index_rc(VmafContext *vmaf_context, VmafModel *vmaf_model, - const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *distorted, - int bit_depth, int frame_index, double *vmaf); -#else -void aom_calc_vmaf(const char *model_path, const YV12_BUFFER_CONFIG *source, + +void aom_init_vmaf_context(VmafContext **vmaf_context, VmafModel *vmaf_model, + bool cal_vmaf_neg); +void aom_close_vmaf_context(VmafContext *vmaf_context); + +void aom_init_vmaf_model(VmafModel **vmaf_model, const char *model_path); +void aom_close_vmaf_model(VmafModel *vmaf_model); + +void aom_calc_vmaf(VmafModel *vmaf_model, const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *distorted, int bit_depth, - double *vmaf); - -void aom_calc_vmaf_multi_frame( - 
void *user_data, const char *model_path, - int (*read_frame)(float *ref_data, float *main_data, float *temp_data, - int stride_byte, void *user_data), - int frame_width, int frame_height, int bit_depth, double *vmaf); -#endif // CONFIG_USE_VMAF_RC + bool cal_vmaf_neg, double *vmaf); + +void aom_read_vmaf_image(VmafContext *vmaf_context, + const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *distorted, int bit_depth, + int frame_index); + +double aom_calc_vmaf_at_index(VmafContext *vmaf_context, VmafModel *vmaf_model, + int frame_index); + +void aom_flush_vmaf_context(VmafContext *vmaf_context); #endif // AOM_AOM_DSP_VMAF_H_ diff --git a/third_party/libaom/source/libaom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/libaom/source/libaom/aom_dsp/x86/highbd_sad_sse2.asm index 58f1ac964e..a2510d5e7f 100644 --- a/third_party/libaom/source/libaom/aom_dsp/x86/highbd_sad_sse2.asm +++ b/third_party/libaom/source/libaom/aom_dsp/x86/highbd_sad_sse2.asm @@ -20,20 +20,21 @@ SECTION .text ; Arg 2: Height ; Arg 3: Number of general purpose registers: 5 for 32-bit build, 6 for 64-bit ; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows -%macro HIGH_SAD_FN 4 +; Arg 5: Number of xmm registers. 
8xh needs 8, others only need 7 +%macro HIGH_SAD_FN 4-5 7 %if %4 == 0 %if %3 == 5 -cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows +cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows %else ; %3 == 7 -cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ +cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 %elif %4 == 1 ; avg %if %3 == 5 -cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \ +cglobal highbd_sad%1x%2_avg, 5, 1 + %3, %5, src, src_stride, ref, ref_stride, \ second_pred, n_rows %else ; %3 == 7 -cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \ +cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, %5, src, src_stride, \ ref, ref_stride, \ second_pred, \ src_stride3, ref_stride3 @@ -356,7 +357,7 @@ HIGH_SAD16XN 8, 2 ; highbd_sad_skip_16x8_sse2 ; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD8XN 1-2 0 - HIGH_SAD_FN 8, %1, 7, %2 + HIGH_SAD_FN 8, %1, 7, %2, 8 %if %2 == 2 ; skip rows, so divide number of rows by 2 mov n_rowsd, %1/8 %else @@ -377,22 +378,30 @@ HIGH_SAD16XN 8, 2 ; highbd_sad_skip_16x8_sse2 pavgw m4, [second_predq+mmsize*3] lea second_predq, [second_predq+mmsize*4] %endif - mova m5, [srcq] - psubusw m5, m1 - psubusw m1, [srcq] + mova m7, m1 + movu m5, [srcq] + psubusw m1, m5 + psubusw m5, m7 por m1, m5 - mova m5, [srcq+src_strideq*2] - psubusw m5, m2 - psubusw m2, [srcq+src_strideq*2] + + mova m7, m2 + movu m5, [srcq+src_strideq*2] + psubusw m2, m5 + psubusw m5, m7 por m2, m5 - mova m5, [srcq+src_strideq*4] - psubusw m5, m3 - psubusw m3, [srcq+src_strideq*4] + + mova m7, m3 + movu m5, [srcq+src_strideq*4] + psubusw m3, m5 + psubusw m5, m7 por m3, m5 - mova m5, [srcq+src_stride3q*2] - psubusw m5, m4 - psubusw m4, [srcq+src_stride3q*2] + + mova m7, m4 + movu m5, [srcq+src_stride3q*2] + 
psubusw m4, m5 + psubusw m5, m7 por m4, m5 + paddw m1, m2 paddw m3, m4 movhlps m2, m1 diff --git a/third_party/libaom/source/libaom/aom_dsp/x86/variance_impl_avx2.c b/third_party/libaom/source/libaom/aom_dsp/x86/variance_impl_avx2.c index f779270ae3..163e4cc566 100644 --- a/third_party/libaom/source/libaom/aom_dsp/x86/variance_impl_avx2.c +++ b/third_party/libaom/source/libaom/aom_dsp/x86/variance_impl_avx2.c @@ -616,7 +616,7 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2( src += src_stride; dst += dst_stride; } - } else if (y_offset == 8) { + } else if (y_offset == 4) { __m256i src_next_reg; for (i = 0; i < height; i++) { LOAD_SRC_DST @@ -652,8 +652,8 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2( dst += dst_stride; } } - // x_offset = 8 and y_offset = 0 - } else if (x_offset == 8) { + // x_offset = 4 and y_offset = 0 + } else if (x_offset == 4) { if (y_offset == 0) { __m256i src_next_reg; for (i = 0; i < height; i++) { @@ -668,8 +668,8 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2( src += src_stride; dst += dst_stride; } - // x_offset = 8 and y_offset = 8 - } else if (y_offset == 8) { + // x_offset = 4 and y_offset = 4 + } else if (y_offset == 4) { __m256i src_next_reg, src_avg; // load source and another source starting from the next // following byte @@ -691,7 +691,7 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2( CALC_SUM_SSE_INSIDE_LOOP dst += dst_stride; } - // x_offset = 8 and y_offset = bilin interpolation + // x_offset = 4 and y_offset = bilin interpolation } else { __m256i filter, pw8, src_next_reg, src_avg; y_offset <<= 5; @@ -741,8 +741,8 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2( src += src_stride; dst += dst_stride; } - // x_offset = bilin interpolation and y_offset = 8 - } else if (y_offset == 8) { + // x_offset = bilin interpolation and y_offset = 4 + } else if (y_offset == 4) { __m256i filter, pw8, src_next_reg, src_pack; x_offset <<= 5; filter = _mm256_load_si256( diff --git 
a/third_party/libaom/source/libaom/apps/aomenc.c b/third_party/libaom/source/libaom/apps/aomenc.c index 11035bf129..c09c3ca9c2 100644 --- a/third_party/libaom/source/libaom/apps/aomenc.c +++ b/third_party/libaom/source/libaom/apps/aomenc.c @@ -227,6 +227,8 @@ static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED, #if CONFIG_TUNE_VMAF AV1E_SET_VMAF_MODEL_PATH, #endif + AV1E_SET_DV_COST_UPD_FREQ, + AV1E_SET_PARTITION_INFO_PATH, 0 }; const arg_def_t *main_args[] = { &g_av1_codec_arg_defs.help, @@ -422,6 +424,8 @@ const arg_def_t *av1_ctrl_args[] = { #if CONFIG_TUNE_VMAF &g_av1_codec_arg_defs.vmaf_model_path, #endif + &g_av1_codec_arg_defs.dv_cost_upd_freq, + &g_av1_codec_arg_defs.partition_info_path, NULL, }; @@ -505,6 +509,7 @@ struct stream_config { #if CONFIG_TUNE_VMAF const char *vmaf_model_path; #endif + const char *partition_info_path; aom_color_range_t color_range; }; @@ -681,6 +686,8 @@ static void parse_global_config(struct AvxEncoderConfig *global, char ***argv) { if (global->usage == AOM_USAGE_REALTIME && global->passes > 1) { warn("Enforcing one-pass encoding in realtime mode\n"); + if (global->pass > 1) + die("Error: Invalid --pass=%d for one-pass encoding\n", global->pass); global->passes = 1; } @@ -853,9 +860,9 @@ static void set_config_arg_key_vals(struct stream_config *config, } /* Point either to the next free element or the first instance of this - * control. + * option. 
*/ - for (j = 0; j < config->arg_ctrl_cnt; j++) + for (j = 0; j < config->arg_key_val_cnt; j++) if (strcmp(name, config->arg_key_vals[j][0]) == 0) break; /* Update/insert */ @@ -1071,6 +1078,9 @@ static int parse_stream_params(struct AvxEncoderConfig *global, } else if (arg_match(&arg, &g_av1_codec_arg_defs.vmaf_model_path, argi)) { config->vmaf_model_path = arg.val; #endif + } else if (arg_match(&arg, &g_av1_codec_arg_defs.partition_info_path, + argi)) { + config->partition_info_path = arg.val; } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_fixed_qp_offsets, argi)) { config->cfg.use_fixed_qp_offsets = arg_parse_uint(&arg); @@ -1078,9 +1088,14 @@ static int parse_stream_params(struct AvxEncoderConfig *global, const int fixed_qp_offset_count = arg_parse_list( &arg, config->cfg.fixed_qp_offsets, FIXED_QP_OFFSET_COUNT); if (fixed_qp_offset_count < FIXED_QP_OFFSET_COUNT) { - die("Option --fixed_qp_offsets requires %d comma-separated values, but " - "only %d values were provided.\n", - FIXED_QP_OFFSET_COUNT, fixed_qp_offset_count); + if (fixed_qp_offset_count < 2) { + die("Option --fixed_qp_offsets requires at least 2 comma-separated " + "values for kf and arf, but only %d were provided.\n", + fixed_qp_offset_count); + } + for (int k = fixed_qp_offset_count; k < FIXED_QP_OFFSET_COUNT; ++k) + config->cfg.fixed_qp_offsets[k] = + (config->cfg.fixed_qp_offsets[k - 1] + 1) / 2; } config->cfg.use_fixed_qp_offsets = 1; } else if (global->usage == AOM_USAGE_REALTIME && @@ -1301,7 +1316,6 @@ static void show_stream_config(struct stream_state *stream, SHOW_PARAMS(disable_intrabc); SHOW_PARAMS(disable_cfl); SHOW_PARAMS(disable_smooth_intra); - SHOW_PARAMS(disable_diagonal_intra); SHOW_PARAMS(disable_filter_intra); SHOW_PARAMS(disable_dual_filter); SHOW_PARAMS(disable_intra_angle_delta); @@ -1437,6 +1451,11 @@ static void initialize_encoder(struct stream_state *stream, stream->config.vmaf_model_path); } #endif + if (stream->config.partition_info_path) { + 
AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, + AV1E_SET_PARTITION_INFO_PATH, + stream->config.partition_info_path); + } if (stream->config.film_grain_filename) { AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_FILM_GRAIN_TABLE, @@ -1473,6 +1492,33 @@ static void initialize_encoder(struct stream_state *stream, #endif } +// Convert the input image 'img' to a monochrome image. The Y plane of the +// output image is a shallow copy of the Y plane of the input image, therefore +// the input image must remain valid for the lifetime of the output image. The U +// and V planes of the output image are set to null pointers. The output image +// format is AOM_IMG_FMT_I420 because libaom does not have AOM_IMG_FMT_I400. +static void convert_image_to_monochrome(const struct aom_image *img, + struct aom_image *monochrome_img) { + *monochrome_img = *img; + monochrome_img->fmt = AOM_IMG_FMT_I420; + if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + monochrome_img->fmt |= AOM_IMG_FMT_HIGHBITDEPTH; + } + monochrome_img->monochrome = 1; + monochrome_img->csp = AOM_CSP_UNKNOWN; + monochrome_img->x_chroma_shift = 1; + monochrome_img->y_chroma_shift = 1; + monochrome_img->planes[AOM_PLANE_U] = NULL; + monochrome_img->planes[AOM_PLANE_V] = NULL; + monochrome_img->stride[AOM_PLANE_U] = 0; + monochrome_img->stride[AOM_PLANE_V] = 0; + monochrome_img->sz = 0; + monochrome_img->bps = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 
16 : 8; + monochrome_img->img_data = NULL; + monochrome_img->img_data_owner = 0; + monochrome_img->self_allocd = 0; +} + static void encode_frame(struct stream_state *stream, struct AvxEncoderConfig *global, struct aom_image *img, unsigned int frames_in) { @@ -1552,6 +1598,12 @@ static void encode_frame(struct stream_state *stream, #endif } + struct aom_image monochrome_img; + if (img && cfg->monochrome) { + convert_image_to_monochrome(img, &monochrome_img); + img = &monochrome_img; + } + aom_usec_timer_start(&timer); aom_codec_encode(&stream->encoder, img, frame_start, (uint32_t)(next_frame_start - frame_start), 0); @@ -1941,8 +1993,10 @@ int main(int argc, const char **argv_) { stream->config.cfg.g_profile = 1; profile_updated = 1; } - } else if (input.bit_depth == 12 || input.fmt == AOM_IMG_FMT_I422 || - input.fmt == AOM_IMG_FMT_I42216) { + } else if (input.bit_depth == 12 || + ((input.fmt == AOM_IMG_FMT_I422 || + input.fmt == AOM_IMG_FMT_I42216) && + !stream->config.cfg.monochrome)) { stream->config.cfg.g_profile = 2; profile_updated = 1; } diff --git a/third_party/libaom/source/libaom/av1/arg_defs.c b/third_party/libaom/source/libaom/av1/arg_defs.c index e79f9b2934..8646b09c9d 100644 --- a/third_party/libaom/source/libaom/av1/arg_defs.c +++ b/third_party/libaom/source/libaom/av1/arg_defs.c @@ -271,7 +271,9 @@ const av1_codec_arg_definitions_t g_av1_codec_arg_defs = { .noise_sens = ARG_DEF(NULL, "noise-sensitivity", 1, "Noise sensitivity (frames to blur)"), .sharpness = ARG_DEF(NULL, "sharpness", 1, - "Loop filter sharpness (0..7), default is 0"), + "Bias towards block sharpness in rate-distortion " + "optimization of transform coefficients " + "(0..7), default is 0"), .static_thresh = ARG_DEF(NULL, "static-thresh", 1, "Motion detection threshold"), .auto_altref = @@ -448,13 +450,16 @@ const av1_codec_arg_definitions_t g_av1_codec_arg_defs = { "Use Default-transform only for INTRA modes"), .quant_b_adapt = ARG_DEF(NULL, "quant-b-adapt", 1, "Use adaptive 
quantize_b"), .coeff_cost_upd_freq = ARG_DEF(NULL, "coeff-cost-upd-freq", 1, - "Update freq for coeff costs" + "Update freq for coeff costs. " "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"), .mode_cost_upd_freq = ARG_DEF(NULL, "mode-cost-upd-freq", 1, - "Update freq for mode costs" + "Update freq for mode costs. " "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"), .mv_cost_upd_freq = ARG_DEF(NULL, "mv-cost-upd-freq", 1, - "Update freq for mv costs" + "Update freq for mv costs. " + "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"), + .dv_cost_upd_freq = ARG_DEF(NULL, "dv-cost-upd-freq", 1, + "Update freq for dv costs. " "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"), .num_tg = ARG_DEF(NULL, "num-tile-groups", 1, "Maximum number of tile groups, default is 1"), @@ -471,6 +476,8 @@ const av1_codec_arg_definitions_t g_av1_codec_arg_defs = { .vmaf_model_path = ARG_DEF(NULL, "vmaf-model-path", 1, "Path to the VMAF model file"), #endif + .partition_info_path = ARG_DEF(NULL, "partition-info-path", 1, + "Partition information read and write path"), .film_grain_test = ARG_DEF( NULL, "film-grain-test", 1, "Film grain test vectors (0: none (default), 1: test-1 2: test-2, " @@ -592,7 +599,9 @@ const av1_codec_arg_definitions_t g_av1_codec_arg_defs = { "pyramid. Selected automatically from --cq-level if " "--fixed-qp-offsets is not provided. If this option is not " "specified (default), offsets are adaptively chosen by the " - "encoder."), + "encoder. Further, if this option is specified, at least two " + "comma-separated values corresponding to kf and arf offsets " + "must be provided, while the rest are chosen by the encoder"), .fixed_qp_offsets = ARG_DEF( NULL, "fixed-qp-offsets", 1, @@ -605,6 +614,6 @@ const av1_codec_arg_definitions_t g_av1_codec_arg_defs = { .vbr_corpus_complexity_lap = ARG_DEF( NULL, "vbr-corpus-complexity-lap", 1, "Set average corpus complexity per mb for single pass VBR using lap. 
" - "(0..10000), default is 0") + "(0..10000), default is 0"), #endif // CONFIG_AV1_ENCODER }; diff --git a/third_party/libaom/source/libaom/av1/arg_defs.h b/third_party/libaom/source/libaom/av1/arg_defs.h index f86e91551c..6a8d0d47cf 100644 --- a/third_party/libaom/source/libaom/av1/arg_defs.h +++ b/third_party/libaom/source/libaom/av1/arg_defs.h @@ -173,12 +173,14 @@ typedef struct av1_codec_arg_definitions { arg_def_t coeff_cost_upd_freq; arg_def_t mode_cost_upd_freq; arg_def_t mv_cost_upd_freq; + arg_def_t dv_cost_upd_freq; arg_def_t num_tg; arg_def_t mtu_size; arg_def_t timing_info; #if CONFIG_TUNE_VMAF arg_def_t vmaf_model_path; #endif + arg_def_t partition_info_path; arg_def_t film_grain_test; arg_def_t film_grain_table; #if CONFIG_DENOISE diff --git a/third_party/libaom/source/libaom/av1/av1_cx_iface.c b/third_party/libaom/source/libaom/av1/av1_cx_iface.c index 123bb1dc41..11c47bca24 100644 --- a/third_party/libaom/source/libaom/av1/av1_cx_iface.c +++ b/third_party/libaom/source/libaom/av1/av1_cx_iface.c @@ -26,6 +26,7 @@ #include "av1/encoder/bitstream.h" #include "av1/encoder/encoder.h" #include "av1/encoder/ethread.h" +#include "av1/encoder/external_partition.h" #include "av1/encoder/firstpass.h" #include "av1/arg_defs.h" @@ -51,6 +52,7 @@ struct av1_extracfg { unsigned int gf_max_pyr_height; aom_tune_metric tuning; const char *vmaf_model_path; + const char *partition_info_path; unsigned int cq_level; // constrained quality level unsigned int rc_max_intra_bitrate_pct; unsigned int rc_max_inter_bitrate_pct; @@ -154,12 +156,26 @@ struct av1_extracfg { COST_UPDATE_TYPE coeff_cost_upd_freq; COST_UPDATE_TYPE mode_cost_upd_freq; COST_UPDATE_TYPE mv_cost_upd_freq; + COST_UPDATE_TYPE dv_cost_upd_freq; unsigned int ext_tile_debug; unsigned int sb_multipass_unit_test; }; +#if CONFIG_REALTIME_ONLY +// Settings changed for realtime only build: +// cpu_used: 7 +// enable_tpl_model: 0 +// enable_restoration: 0 +// enable_obmc: 0 +// deltaq_mode: NO_DELTA_Q +// 
enable_global_motion usage: 0 +// enable_warped_motion at sequence level: 0 +// allow_warped_motion at frame level: 0 +// coeff_cost_upd_freq: COST_UPD_OFF +// mode_cost_upd_freq: COST_UPD_OFF +// mv_cost_upd_freq: COST_UPD_OFF static struct av1_extracfg default_extra_cfg = { - 0, // cpu_used + 7, // cpu_used 1, // enable_auto_alt_ref 0, // enable_auto_bwd_ref 0, // noise_sensitivity @@ -168,7 +184,7 @@ static struct av1_extracfg default_extra_cfg = { 1, // row_mt 0, // tile_columns 0, // tile_rows - 1, // enable_tpl_model + 0, // enable_tpl_model 1, // enable_keyframe_filtering 7, // arnr_max_frames 5, // arnr_strength @@ -177,31 +193,32 @@ static struct av1_extracfg default_extra_cfg = { 0, // gf_min_pyr_height 5, // gf_max_pyr_height AOM_TUNE_PSNR, // tuning - "/usr/local/share/model/vmaf_v0.6.1.pkl", // VMAF model path - 10, // cq_level - 0, // rc_max_intra_bitrate_pct - 0, // rc_max_inter_bitrate_pct - 0, // gf_cbr_boost_pct - 0, // lossless - 1, // enable_cdef - 1, // enable_restoration - 0, // force_video_mode - 1, // enable_obmc - 3, // disable_trellis_quant - 0, // enable_qm - DEFAULT_QM_Y, // qm_y - DEFAULT_QM_U, // qm_u - DEFAULT_QM_V, // qm_v - DEFAULT_QM_FIRST, // qm_min - DEFAULT_QM_LAST, // qm_max - 1, // max number of tile groups - 0, // mtu_size + "/usr/local/share/model/vmaf_v0.6.1.json", // VMAF model path + ".", // partition info path + 10, // cq_level + 0, // rc_max_intra_bitrate_pct + 0, // rc_max_inter_bitrate_pct + 0, // gf_cbr_boost_pct + 0, // lossless + 1, // enable_cdef + 0, // enable_restoration + 0, // force_video_mode + 0, // enable_obmc + 3, // disable_trellis_quant + 0, // enable_qm + DEFAULT_QM_Y, // qm_y + DEFAULT_QM_U, // qm_u + DEFAULT_QM_V, // qm_v + DEFAULT_QM_FIRST, // qm_min + DEFAULT_QM_LAST, // qm_max + 1, // max number of tile groups + 0, // mtu_size AOM_TIMING_UNSPECIFIED, // No picture timing signaling in bitstream 0, // frame_parallel_decoding_mode 1, // enable dual filter 0, // enable delta quant in chroma planes 
NO_AQ, // aq_mode - DELTA_Q_OBJECTIVE, // deltaq_mode + NO_DELTA_Q, // deltaq_mode 0, // delta lf mode 0, // frame_periodic_boost AOM_BITS_8, // Bit depth @@ -243,9 +260,9 @@ static struct av1_extracfg default_extra_cfg = { 1, // enable difference-weighted compound 1, // enable interinter wedge compound 1, // enable interintra wedge compound - 1, // enable_global_motion usage - 1, // enable_warped_motion at sequence level - 1, // allow_warped_motion at frame level + 0, // enable_global_motion usage + 0, // enable_warped_motion at sequence level + 0, // allow_warped_motion at frame level 1, // enable filter intra at sequence level 1, // enable smooth intra modes usage for sequence 1, // enable Paeth intra mode usage for sequence @@ -277,15 +294,148 @@ static struct av1_extracfg default_extra_cfg = { SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + }, // target_seq_level_idx + 0, // tier_mask + 0, // min_cr + COST_UPD_OFF, // coeff_cost_upd_freq + COST_UPD_OFF, // mode_cost_upd_freq + COST_UPD_OFF, // mv_cost_upd_freq + COST_UPD_OFF, // dv_cost_upd_freq + 0, // ext_tile_debug + 0, // sb_multipass_unit_test +}; +#else +static struct av1_extracfg default_extra_cfg = { + 0, // cpu_used + 1, // enable_auto_alt_ref + 0, // enable_auto_bwd_ref + 0, // noise_sensitivity + 0, // sharpness + 0, // static_thresh + 1, // row_mt + 0, // tile_columns + 0, // tile_rows + 1, // enable_tpl_model + 1, // enable_keyframe_filtering + 7, // arnr_max_frames + 5, // arnr_strength + 0, // min_gf_interval; 0 -> default decision + 0, // max_gf_interval; 0 -> default decision + 0, // gf_min_pyr_height + 5, // gf_max_pyr_height + AOM_TUNE_PSNR, // tuning + "/usr/local/share/model/vmaf_v0.6.1.json", // VMAF model path + ".", // partition info path + 10, // cq_level + 0, // rc_max_intra_bitrate_pct + 0, // rc_max_inter_bitrate_pct + 0, // gf_cbr_boost_pct + 0, // 
lossless + 1, // enable_cdef + 1, // enable_restoration + 0, // force_video_mode + 1, // enable_obmc + 3, // disable_trellis_quant + 0, // enable_qm + DEFAULT_QM_Y, // qm_y + DEFAULT_QM_U, // qm_u + DEFAULT_QM_V, // qm_v + DEFAULT_QM_FIRST, // qm_min + DEFAULT_QM_LAST, // qm_max + 1, // max number of tile groups + 0, // mtu_size + AOM_TIMING_UNSPECIFIED, // No picture timing signaling in bitstream + 0, // frame_parallel_decoding_mode + 1, // enable dual filter + 0, // enable delta quant in chroma planes + NO_AQ, // aq_mode + DELTA_Q_OBJECTIVE, // deltaq_mode + 0, // delta lf mode + 0, // frame_periodic_boost + AOM_BITS_8, // Bit depth + AOM_CONTENT_DEFAULT, // content + AOM_CICP_CP_UNSPECIFIED, // CICP color primaries + AOM_CICP_TC_UNSPECIFIED, // CICP transfer characteristics + AOM_CICP_MC_UNSPECIFIED, // CICP matrix coefficients + AOM_CSP_UNKNOWN, // chroma sample position + 0, // color range + 0, // render width + 0, // render height + AOM_SUPERBLOCK_SIZE_DYNAMIC, // superblock_size + 1, // this depends on large_scale_tile. + 0, // error_resilient_mode off by default. + 0, // s_frame_mode off by default. 
+ 0, // film_grain_test_vector + 0, // film_grain_table_filename + 0, // motion_vector_unit_test + 1, // CDF update mode + 1, // enable rectangular partitions + 1, // enable ab shape partitions + 1, // enable 1:4 and 4:1 partitions + 4, // min_partition_size + 128, // max_partition_size + 1, // enable intra edge filter + 1, // frame order hint + 1, // enable 64-pt transform usage + 1, // enable flip and identity transform + 1, // enable rectangular transform usage + 1, // dist-wtd compound + 7, // max_reference_frames + 0, // enable_reduced_reference_set + 1, // enable_ref_frame_mvs sequence level + 1, // allow ref_frame_mvs frame level + 1, // enable masked compound at sequence level + 1, // enable one sided compound at sequence level + 1, // enable interintra compound at sequence level + 1, // enable smooth interintra mode + 1, // enable difference-weighted compound + 1, // enable interinter wedge compound + 1, // enable interintra wedge compound + 1, // enable_global_motion usage + 1, // enable_warped_motion at sequence level + 1, // allow_warped_motion at frame level + 1, // enable filter intra at sequence level + 1, // enable smooth intra modes usage for sequence + 1, // enable Paeth intra mode usage for sequence + 1, // enable CFL uv intra mode usage for sequence + 1, // enable D45 to D203 intra mode usage for sequence + 1, // superres + 1, // enable overlay + 1, // enable palette + !CONFIG_SHARP_SETTINGS, // enable intrabc + 1, // enable angle delta +#if CONFIG_DENOISE + 0, // noise_level + 32, // noise_block_size + 1, // enable_dnl_denoising +#endif + 0, // chroma_subsampling_x + 0, // chroma_subsampling_y + 0, // reduced_tx_type_set + 0, // use_intra_dct_only + 0, // use_inter_dct_only + 0, // use_intra_default_tx_only + 0, // quant_b_adapt + 0, // vbr_corpus_complexity_lap + { + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, 
SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, }, // target_seq_level_idx 0, // tier_mask 0, // min_cr COST_UPD_SB, // coeff_cost_upd_freq COST_UPD_SB, // mode_cost_upd_freq COST_UPD_SB, // mv_cost_upd_freq + COST_UPD_SB, // dv_cost_upd_freq 0, // ext_tile_debug 0, // sb_multipass_unit_test }; +#endif struct aom_codec_alg_priv { aom_codec_priv_t base; @@ -380,7 +530,11 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, RANGE_CHECK_HI(extra_cfg, deltaq_mode, DELTA_Q_MODE_COUNT - 1); RANGE_CHECK_HI(extra_cfg, deltalf_mode, 1); RANGE_CHECK_HI(extra_cfg, frame_periodic_boost, 1); - RANGE_CHECK_HI(cfg, g_usage, 2); +#if CONFIG_REALTIME_ONLY + RANGE_CHECK(cfg, g_usage, AOM_USAGE_REALTIME, AOM_USAGE_REALTIME); +#else + RANGE_CHECK_HI(cfg, g_usage, AOM_USAGE_ALL_INTRA); +#endif RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS); RANGE_CHECK(cfg, rc_end_usage, AOM_VBR, AOM_Q); RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100); @@ -540,15 +694,6 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, } #endif -#if !CONFIG_USE_VMAF_RC - if (extra_cfg->tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) { - ERROR( - "This error may be related to the wrong configuration options: try to " - "set -DCONFIG_TUNE_VMAF=1 and -DCONFIG_USE_VMAF_RC=1 at the time CMake" - " is run."); - } -#endif - RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_BUTTERAUGLI); RANGE_CHECK(extra_cfg, timing_info_type, AOM_TIMING_UNSPECIFIED, @@ -572,6 +717,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, RANGE_CHECK(extra_cfg, coeff_cost_upd_freq, 0, 3); RANGE_CHECK(extra_cfg, mode_cost_upd_freq, 0, 3); RANGE_CHECK(extra_cfg, mv_cost_upd_freq, 0, 3); + RANGE_CHECK(extra_cfg, dv_cost_upd_freq, 0, 3); 
RANGE_CHECK(extra_cfg, min_partition_size, 4, 128); RANGE_CHECK(extra_cfg, max_partition_size, 4, 128); @@ -619,13 +765,14 @@ static aom_codec_err_t validate_img(aom_codec_alg_priv_t *ctx, #if CONFIG_TUNE_BUTTERAUGLI if (ctx->extra_cfg.tuning == AOM_TUNE_BUTTERAUGLI) { - if (img->x_chroma_shift != 1 || img->y_chroma_shift != 1) { - ERROR("Only YV12/I420 images supported in tune=butteraugli mode."); + if (img->bit_depth > 8) { + ERROR("Only 8 bit depth images supported in tune=butteraugli mode."); } - if ((img->cp != 0 && img->cp != AOM_CICP_CP_BT_709) || - (img->tc != 0 && img->tc != AOM_CICP_TC_BT_709) || - (img->mc != 0 && img->mc != AOM_CICP_MC_BT_709)) { - ERROR("Only BT.709 images supported in tune=butteraugli mode."); + if (img->mc != 0 && img->mc != AOM_CICP_MC_BT_709 && + img->mc != AOM_CICP_MC_BT_601 && img->mc != AOM_CICP_MC_BT_470_B_G) { + ERROR( + "Only BT.709 and BT.601 matrix coefficients supported in " + "tune=butteraugli mode. Identity matrix is treated as BT.601."); } } #endif @@ -689,7 +836,6 @@ static void update_default_encoder_config(const cfg_options_t *cfg, extra_cfg->enable_smooth_intra = (cfg->disable_smooth_intra == 0); extra_cfg->enable_paeth_intra = (cfg->disable_paeth_intra == 0); extra_cfg->enable_cfl_intra = (cfg->disable_cfl == 0); - extra_cfg->enable_diagonal_intra = (cfg->disable_diagonal_intra == 0); extra_cfg->enable_obmc = (cfg->disable_obmc == 0); extra_cfg->enable_palette = (cfg->disable_palette == 0); extra_cfg->enable_intrabc = (cfg->disable_intrabc == 0); @@ -709,12 +855,12 @@ static double convert_qp_offset(int cq_level, int q_offset, int bit_depth) { return (base_q_val - new_q_val); } -static double get_modeled_qp_offset(int cq_level, int level, int bit_depth) { - // 80% for keyframe was derived empirically. - // 40% similar to rc_pick_q_and_bounds_one_pass_vbr() for Q mode ARF. +static double get_modeled_qp_offset(int qp, int level, int bit_depth) { + // 76% for keyframe was derived empirically. 
+ // 60% similar to rc_pick_q_and_bounds_one_pass_vbr() for Q mode ARF. // Rest derived similar to rc_pick_q_and_bounds_two_pass() - static const int percents[FIXED_QP_OFFSET_COUNT] = { 76, 60, 30, 15, 8 }; - const double q_val = av1_convert_qindex_to_q(cq_level, bit_depth); + static const int percents[FIXED_QP_OFFSET_COUNT] = { 76, 60, 30, 15, 8, 4 }; + const double q_val = av1_convert_qindex_to_q(qp, bit_depth); return q_val * percents[level] / 100; } @@ -916,6 +1062,7 @@ static aom_codec_err_t set_encoder_config(AV1EncoderConfig *oxcf, oxcf->cost_upd_freq.coeff = (COST_UPDATE_TYPE)extra_cfg->coeff_cost_upd_freq; oxcf->cost_upd_freq.mode = (COST_UPDATE_TYPE)extra_cfg->mode_cost_upd_freq; oxcf->cost_upd_freq.mv = (COST_UPDATE_TYPE)extra_cfg->mv_cost_upd_freq; + oxcf->cost_upd_freq.dv = (COST_UPDATE_TYPE)extra_cfg->dv_cost_upd_freq; // Set frame resize mode configuration. resize_cfg->resize_mode = (RESIZE_MODE)cfg->rc_resize_mode; @@ -1044,7 +1191,7 @@ static aom_codec_err_t set_encoder_config(AV1EncoderConfig *oxcf, oxcf->motion_mode_cfg.enable_obmc = extra_cfg->enable_obmc; oxcf->motion_mode_cfg.enable_warped_motion = extra_cfg->enable_warped_motion; oxcf->motion_mode_cfg.allow_warped_motion = - (cfg->g_usage == AOM_USAGE_REALTIME) + (cfg->g_usage == AOM_USAGE_REALTIME && oxcf->speed >= 7) ? 
false : (extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion); @@ -1141,6 +1288,8 @@ static aom_codec_err_t set_encoder_config(AV1EncoderConfig *oxcf, sizeof(oxcf->target_seq_level_idx)); oxcf->tier_mask = extra_cfg->tier_mask; + oxcf->partition_info_path = extra_cfg->partition_info_path; + return AOM_CODEC_OK; } @@ -1179,10 +1328,20 @@ static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx, ctx->cfg = *cfg; set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); // On profile change, request a key frame - force_key |= ctx->ppi->cpi->common.seq_params.profile != ctx->oxcf.profile; - av1_change_config(ctx->ppi->cpi, &ctx->oxcf); + force_key |= ctx->ppi->seq_params.profile != ctx->oxcf.profile; + bool is_sb_size_changed = false; + av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed); +#if CONFIG_FRAME_PARALLEL_ENCODE + int i; + for (i = 0; i < ctx->ppi->num_fp_contexts; i++) { + av1_change_config(ctx->ppi->parallel_cpi[i], &ctx->oxcf, + is_sb_size_changed); + } +#else + av1_change_config(ctx->ppi->cpi, &ctx->oxcf, is_sb_size_changed); +#endif // CONFIG_FRAME_PARALLEL_ENCODE if (ctx->ppi->cpi_lap != NULL) { - av1_change_config(ctx->ppi->cpi_lap, &ctx->oxcf); + av1_change_config(ctx->ppi->cpi_lap, &ctx->oxcf, is_sb_size_changed); } } @@ -1192,7 +1351,7 @@ static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx, } static aom_fixed_buf_t *encoder_get_global_headers(aom_codec_alg_priv_t *ctx) { - return av1_get_global_headers(ctx->ppi->cpi); + return av1_get_global_headers(ctx->ppi); } static aom_codec_err_t ctrl_get_quantizer(aom_codec_alg_priv_t *ctx, @@ -1215,7 +1374,7 @@ static aom_codec_err_t ctrl_get_baseline_gf_interval(aom_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); if (arg == NULL) return AOM_CODEC_INVALID_PARAM; - *arg = ctx->ppi->cpi->rc.baseline_gf_interval; + *arg = ctx->ppi->p_rc.baseline_gf_interval; return AOM_CODEC_OK; } @@ -1225,9 +1384,19 @@ static aom_codec_err_t 
update_extra_cfg(aom_codec_alg_priv_t *ctx, if (res == AOM_CODEC_OK) { ctx->extra_cfg = *extra_cfg; set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); - av1_change_config(ctx->ppi->cpi, &ctx->oxcf); + bool is_sb_size_changed = false; + av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed); +#if CONFIG_FRAME_PARALLEL_ENCODE + int i; + for (i = 0; i < ctx->ppi->num_fp_contexts; i++) { + av1_change_config(ctx->ppi->parallel_cpi[i], &ctx->oxcf, + is_sb_size_changed); + } +#else + av1_change_config(ctx->ppi->cpi, &ctx->oxcf, is_sb_size_changed); +#endif // CONFIG_FRAME_PARALLEL_ENCODE if (ctx->ppi->cpi_lap != NULL) { - av1_change_config(ctx->ppi->cpi_lap, &ctx->oxcf); + av1_change_config(ctx->ppi->cpi_lap, &ctx->oxcf, is_sb_size_changed); } } return res; @@ -1299,7 +1468,13 @@ static aom_codec_err_t ctrl_set_tile_rows(aom_codec_alg_priv_t *ctx, static aom_codec_err_t ctrl_set_enable_tpl_model(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; - extra_cfg.enable_tpl_model = CAST(AV1E_SET_ENABLE_TPL_MODEL, args); + const unsigned int tpl_model_arg = CAST(AV1E_SET_ENABLE_TPL_MODEL, args); +#if CONFIG_REALTIME_ONLY + if (tpl_model_arg) { + ERROR("TPL model can't be turned on in realtime only build."); + } +#endif + extra_cfg.enable_tpl_model = tpl_model_arg; return update_extra_cfg(ctx, &extra_cfg); } @@ -1379,7 +1554,13 @@ static aom_codec_err_t ctrl_set_enable_cdef(aom_codec_alg_priv_t *ctx, static aom_codec_err_t ctrl_set_enable_restoration(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; - extra_cfg.enable_restoration = CAST(AV1E_SET_ENABLE_RESTORATION, args); + const unsigned int restoration_arg = CAST(AV1E_SET_ENABLE_RESTORATION, args); +#if CONFIG_REALTIME_ONLY + if (restoration_arg) { + ERROR("Restoration can't be turned on in realtime only build."); + } +#endif + extra_cfg.enable_restoration = restoration_arg; return update_extra_cfg(ctx, &extra_cfg); } @@ -1393,7 
+1574,13 @@ static aom_codec_err_t ctrl_set_force_video_mode(aom_codec_alg_priv_t *ctx, static aom_codec_err_t ctrl_set_enable_obmc(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; - extra_cfg.enable_obmc = CAST(AV1E_SET_ENABLE_OBMC, args); + const unsigned int obmc_arg = CAST(AV1E_SET_ENABLE_OBMC, args); +#if CONFIG_REALTIME_ONLY + if (obmc_arg) { + ERROR("OBMC can't be enabled in realtime only build."); + } +#endif + extra_cfg.enable_obmc = obmc_arg; return update_extra_cfg(ctx, &extra_cfg); } @@ -1637,14 +1824,26 @@ static aom_codec_err_t ctrl_set_enable_interintra_wedge( static aom_codec_err_t ctrl_set_enable_global_motion(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; - extra_cfg.enable_global_motion = CAST(AV1E_SET_ENABLE_GLOBAL_MOTION, args); + const int global_motion_arg = CAST(AV1E_SET_ENABLE_GLOBAL_MOTION, args); +#if CONFIG_REALTIME_ONLY + if (global_motion_arg) { + ERROR("Global motion can't be enabled in realtime only build."); + } +#endif + extra_cfg.enable_global_motion = global_motion_arg; return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_enable_warped_motion(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; - extra_cfg.enable_warped_motion = CAST(AV1E_SET_ENABLE_WARPED_MOTION, args); + const int warped_motion_arg = CAST(AV1E_SET_ENABLE_WARPED_MOTION, args); +#if CONFIG_REALTIME_ONLY + if (warped_motion_arg) { + ERROR("Warped motion can't be enabled in realtime only build."); + } +#endif + extra_cfg.enable_warped_motion = warped_motion_arg; return update_extra_cfg(ctx, &extra_cfg); } @@ -1825,6 +2024,13 @@ static aom_codec_err_t ctrl_set_mv_cost_upd_freq(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static aom_codec_err_t ctrl_set_dv_cost_upd_freq(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.dv_cost_upd_freq = 
CAST(AV1E_SET_DV_COST_UPD_FREQ, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static aom_codec_err_t ctrl_set_vmaf_model_path(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; @@ -1832,6 +2038,13 @@ static aom_codec_err_t ctrl_set_vmaf_model_path(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static aom_codec_err_t ctrl_set_partition_info_path(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.partition_info_path = CAST(AV1E_SET_PARTITION_INFO_PATH, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static aom_codec_err_t ctrl_set_film_grain_test_vector( aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; @@ -1890,7 +2103,13 @@ static aom_codec_err_t ctrl_set_enable_dnl_denoising(aom_codec_alg_priv_t *ctx, static aom_codec_err_t ctrl_set_deltaq_mode(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; - extra_cfg.deltaq_mode = CAST(AV1E_SET_DELTAQ_MODE, args); + const DELTAQ_MODE deltaq_arg = CAST(AV1E_SET_DELTAQ_MODE, args); +#if CONFIG_REALTIME_ONLY + if (deltaq_arg > NO_DELTA_Q) { + ERROR("Delta Q mode can't be enabled in realtime only build."); + } +#endif + extra_cfg.deltaq_mode = deltaq_arg; return update_extra_cfg(ctx, &extra_cfg); } @@ -1986,6 +2205,18 @@ static aom_codec_err_t ctrl_enable_sb_multipass_unit_test( return update_extra_cfg(ctx, &extra_cfg); } +static aom_codec_err_t ctrl_set_external_partition(aom_codec_alg_priv_t *ctx, + va_list args) { + AV1_COMP *const cpi = ctx->ppi->cpi; + aom_ext_part_funcs_t funcs = *CAST(AV1E_SET_EXTERNAL_PARTITION, args); + aom_ext_part_config_t config; + // TODO(chengchen): verify the sb_size has been set at this point. 
+ config.superblock_size = cpi->common.seq_params->sb_size; + const aom_codec_err_t status = + av1_ext_part_create(funcs, config, &cpi->ext_part_controller); + return status; +} + #if !CONFIG_REALTIME_ONLY static aom_codec_err_t create_stats_buffer(FIRSTPASS_STATS **frame_stats_buffer, STATS_BUFFER_CTX *stats_buf_context, @@ -2014,27 +2245,22 @@ static aom_codec_err_t create_stats_buffer(FIRSTPASS_STATS **frame_stats_buffer, static aom_codec_err_t create_context_and_bufferpool( AV1_PRIMARY *ppi, AV1_COMP **p_cpi, BufferPool **p_buffer_pool, - AV1EncoderConfig *oxcf, struct aom_codec_pkt_list *pkt_list_head, - FIRSTPASS_STATS *frame_stats_buf, COMPRESSOR_STAGE stage, - int num_lap_buffers, int lap_lag_in_frames, - STATS_BUFFER_CTX *stats_buf_context) { + AV1EncoderConfig *oxcf, COMPRESSOR_STAGE stage, int lap_lag_in_frames) { aom_codec_err_t res = AOM_CODEC_OK; - *p_buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool)); - if (*p_buffer_pool == NULL) return AOM_CODEC_MEM_ERROR; + if (*p_buffer_pool == NULL) { + *p_buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool)); + if (*p_buffer_pool == NULL) return AOM_CODEC_MEM_ERROR; #if CONFIG_MULTITHREAD - if (pthread_mutex_init(&((*p_buffer_pool)->pool_mutex), NULL)) { - return AOM_CODEC_MEM_ERROR; - } + if (pthread_mutex_init(&((*p_buffer_pool)->pool_mutex), NULL)) { + return AOM_CODEC_MEM_ERROR; + } #endif - *p_cpi = av1_create_compressor(ppi, oxcf, *p_buffer_pool, frame_stats_buf, - stage, num_lap_buffers, lap_lag_in_frames, - stats_buf_context); - if (*p_cpi == NULL) - res = AOM_CODEC_MEM_ERROR; - else - (*p_cpi)->output_pkt_list = pkt_list_head; + } + *p_cpi = av1_create_compressor(ppi, oxcf, *p_buffer_pool, stage, + lap_lag_in_frames); + if (*p_cpi == NULL) res = AOM_CODEC_MEM_ERROR; return res; } @@ -2084,27 +2310,48 @@ static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx) { priv->oxcf.use_highbitdepth = (ctx->init_flags & AOM_CODEC_USE_HIGHBITDEPTH) ? 
1 : 0; - priv->ppi = av1_create_primary_compressor(); + priv->ppi = av1_create_primary_compressor(&priv->pkt_list.head, + *num_lap_buffers, &priv->oxcf); if (!priv->ppi) return AOM_CODEC_MEM_ERROR; #if !CONFIG_REALTIME_ONLY res = create_stats_buffer(&priv->frame_stats_buffer, &priv->stats_buf_context, *num_lap_buffers); if (res != AOM_CODEC_OK) return AOM_CODEC_MEM_ERROR; + + assert(MAX_LAP_BUFFERS >= MAX_LAG_BUFFERS); + int size = get_stats_buf_size(*num_lap_buffers, MAX_LAG_BUFFERS); + for (int i = 0; i < size; i++) + priv->ppi->twopass.frame_stats_arr[i] = &priv->frame_stats_buffer[i]; + + priv->ppi->twopass.stats_buf_ctx = &priv->stats_buf_context; + priv->ppi->twopass.stats_in = + priv->ppi->twopass.stats_buf_ctx->stats_in_start; #endif - res = create_context_and_bufferpool( - priv->ppi, &priv->ppi->cpi, &priv->buffer_pool, &priv->oxcf, - &priv->pkt_list.head, priv->frame_stats_buffer, ENCODE_STAGE, - *num_lap_buffers, -1, &priv->stats_buf_context); +#if CONFIG_FRAME_PARALLEL_ENCODE + assert(priv->ppi->num_fp_contexts >= 1); + int i; + for (i = 0; i < priv->ppi->num_fp_contexts; i++) { + res = create_context_and_bufferpool( + priv->ppi, &priv->ppi->parallel_cpi[i], &priv->buffer_pool, + &priv->oxcf, ENCODE_STAGE, -1); + if (res != AOM_CODEC_OK) { + return res; + } + } + priv->ppi->cpi = priv->ppi->parallel_cpi[0]; +#else + res = create_context_and_bufferpool(priv->ppi, &priv->ppi->cpi, + &priv->buffer_pool, &priv->oxcf, + ENCODE_STAGE, -1); +#endif // CONFIG_FRAME_PARALLEL_ENCODE // Create another compressor if look ahead is enabled if (res == AOM_CODEC_OK && *num_lap_buffers) { res = create_context_and_bufferpool( priv->ppi, &priv->ppi->cpi_lap, &priv->buffer_pool_lap, &priv->oxcf, - NULL, priv->frame_stats_buffer, LAP_STAGE, *num_lap_buffers, - clamp(lap_lag_in_frames, 0, MAX_LAG_BUFFERS), - &priv->stats_buf_context); + LAP_STAGE, clamp(lap_lag_in_frames, 0, MAX_LAG_BUFFERS)); } } } @@ -2113,12 +2360,16 @@ static aom_codec_err_t encoder_init(aom_codec_ctx_t 
*ctx) { } static void destroy_context_and_bufferpool(AV1_COMP *cpi, - BufferPool *buffer_pool) { + BufferPool **p_buffer_pool) { av1_remove_compressor(cpi); + if (*p_buffer_pool) { + av1_free_ref_frame_buffers(*p_buffer_pool); #if CONFIG_MULTITHREAD - if (buffer_pool) pthread_mutex_destroy(&buffer_pool->pool_mutex); + pthread_mutex_destroy(&(*p_buffer_pool)->pool_mutex); #endif - aom_free(buffer_pool); + aom_free(*p_buffer_pool); + *p_buffer_pool = NULL; + } } static void destroy_stats_buffer(STATS_BUFFER_CTX *stats_buf_context, @@ -2133,9 +2384,30 @@ static aom_codec_err_t encoder_destroy(aom_codec_alg_priv_t *ctx) { if (ctx->ppi) { AV1_PRIMARY *ppi = ctx->ppi; - destroy_context_and_bufferpool(ppi->cpi, ctx->buffer_pool); +#if CONFIG_FRAME_PARALLEL_ENCODE + for (int i = 0; i < ppi->num_fp_contexts - 1; i++) { + if (ppi->parallel_frames_data[i].cx_data_frame) { + free(ppi->parallel_frames_data[i].cx_data_frame); + } + } +#endif +#if CONFIG_ENTROPY_STATS + print_entropy_stats(ppi); +#endif +#if CONFIG_INTERNAL_STATS + print_internal_stats(ppi); +#endif +#if CONFIG_FRAME_PARALLEL_ENCODE + int i; + for (i = 0; i < ppi->num_fp_contexts; i++) { + destroy_context_and_bufferpool(ppi->parallel_cpi[i], &ctx->buffer_pool); + } + ppi->cpi = NULL; +#else + destroy_context_and_bufferpool(ppi->cpi, &ctx->buffer_pool); +#endif // CONFIG_FRAME_PARALLEL_ENCODE if (ppi->cpi_lap) { - destroy_context_and_bufferpool(ppi->cpi_lap, ctx->buffer_pool_lap); + destroy_context_and_bufferpool(ppi->cpi_lap, &ctx->buffer_pool_lap); } av1_remove_primary_compressor(ppi); } @@ -2151,7 +2423,7 @@ static aom_codec_frame_flags_t get_frame_pkt_flags(const AV1_COMP *cpi, aom_codec_frame_flags_t flags = lib_flags << 16; if (lib_flags & FRAMEFLAGS_KEY || - (cpi->use_svc && + (cpi->ppi->use_svc && svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers + svc->temporal_layer_id] .is_key_frame)) @@ -2182,7 +2454,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, AV1_COMP 
*cpi_lap = ppi->cpi_lap; if (cpi == NULL) return AOM_CODEC_INVALID_PARAM; - if (cpi->lap_enabled && cpi_lap == NULL && cpi->oxcf.pass == 0) + if (cpi->ppi->lap_enabled && cpi_lap == NULL && cpi->oxcf.pass == 0) return AOM_CODEC_INVALID_PARAM; if (img != NULL) { @@ -2216,6 +2488,22 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, return AOM_CODEC_MEM_ERROR; } } +#if CONFIG_FRAME_PARALLEL_ENCODE + for (int i = 0; i < cpi->ppi->num_fp_contexts - 1; i++) { + if (cpi->ppi->parallel_frames_data[i].cx_data_frame == NULL) { + cpi->ppi->parallel_frames_data[i].cx_data_sz = uncompressed_frame_sz; + cpi->ppi->parallel_frames_data[i].frame_display_order_hint = -1; + cpi->ppi->parallel_frames_data[i].frame_size = 0; + cpi->ppi->parallel_frames_data[i].cx_data_frame = + (unsigned char *)malloc( + cpi->ppi->parallel_frames_data[i].cx_data_sz); + if (cpi->ppi->parallel_frames_data[i].cx_data_frame == NULL) { + cpi->ppi->parallel_frames_data[i].cx_data_sz = 0; + return AOM_CODEC_MEM_ERROR; + } + } + } +#endif } } @@ -2226,22 +2514,16 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. 
- if (setjmp(cpi->common.error.jmp)) { - cpi->common.error.setjmp = 0; - res = update_error_state(ctx, &cpi->common.error); + if (setjmp(ppi->error.jmp)) { + ppi->error.setjmp = 0; + res = update_error_state(ctx, &ppi->error); aom_clear_system_state(); return res; } - cpi->common.error.setjmp = 1; - if (cpi_lap != NULL) { - if (setjmp(cpi_lap->common.error.jmp)) { - cpi_lap->common.error.setjmp = 0; - res = update_error_state(ctx, &cpi_lap->common.error); - aom_clear_system_state(); - return res; - } - cpi_lap->common.error.setjmp = 1; - } + ppi->error.setjmp = 1; + + if (cpi->ppi->use_svc && cpi->svc.use_flexible_mode == 0 && flags == 0) + av1_set_svc_fixed_mode(cpi); // Note(yunqing): While applying encoding flags, always start from enabling // all, and then modifying according to the flags. Previous frame's flags are @@ -2251,9 +2533,12 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, av1_apply_encoding_flags(cpi_lap, flags); } -#if CONFIG_USE_VMAF_RC - aom_init_vmaf_model_rc(&cpi->vmaf_info.vmaf_model, - cpi->oxcf.tune_cfg.vmaf_model_path); +#if CONFIG_TUNE_VMAF + if (ctx->extra_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING && + ctx->extra_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) { + aom_init_vmaf_model(&cpi->vmaf_info.vmaf_model, + cpi->oxcf.tune_cfg.vmaf_model_path); + } #endif // Handle fixed keyframe intervals @@ -2270,7 +2555,8 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, if (res == AOM_CODEC_OK) { // Set up internal flags - if (ctx->base.init_flags & AOM_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1; + if (ctx->base.init_flags & AOM_CODEC_USE_PSNR) + cpi->ppi->b_calculate_psnr = 1; if (img != NULL) { if (!ctx->pts_offset_initialized) { @@ -2306,11 +2592,18 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, cpi->oxcf.tool_cfg.enable_global_motion); } if (!ppi->lookahead) - aom_internal_error(&cpi->common.error, AOM_CODEC_MEM_ERROR, + aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR, "Failed to 
allocate lag buffers"); - +#if CONFIG_FRAME_PARALLEL_ENCODE + int i; + for (i = 0; i < ppi->num_fp_contexts; i++) { + av1_check_initial_width(ppi->parallel_cpi[i], use_highbitdepth, + subsampling_x, subsampling_y); + } +#else av1_check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y); +#endif if (cpi_lap != NULL) { av1_check_initial_width(cpi_lap, use_highbitdepth, subsampling_x, subsampling_y); @@ -2320,7 +2613,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, // key frame flag when we actually encode this frame. if (av1_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd, src_time_stamp, src_end_time_stamp)) { - res = update_error_state(ctx, &cpi->common.error); + res = update_error_state(ctx, &ppi->error); } ctx->next_frame_flags = 0; } @@ -2337,7 +2630,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, * the buffer size anyway. */ if (cx_data_sz < ctx->cx_data_sz / 2) { - aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, + aom_internal_error(&ppi->error, AOM_CODEC_ERROR, "Compressed data buffer too small"); } } @@ -2358,6 +2651,12 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, } if ((num_workers > 1) && (cpi->mt_info.num_workers == 0)) { av1_create_workers(cpi, num_workers); +#if CONFIG_MULTITHREAD + av1_init_mt_sync(cpi, cpi->oxcf.pass == 1); + if (cpi_lap != NULL) { + av1_init_mt_sync(cpi_lap, 1); + } +#endif // CONFIG_MULTITHREAD if (cpi->oxcf.pass != 1) { av1_create_second_pass_workers(cpi, num_workers); } @@ -2373,13 +2672,12 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, } cpi_lap->mt_info.num_workers = cpi->mt_info.num_workers; const int status = av1_get_compressed_data( - cpi_lap, &lib_flags, &frame_size, NULL, &dst_time_stamp_la, - &dst_end_time_stamp_la, !img, timestamp_ratio); + cpi_lap, &lib_flags, &frame_size, cx_data_sz, NULL, + &dst_time_stamp_la, &dst_end_time_stamp_la, !img, timestamp_ratio); if (status != -1) { if (status != 
AOM_CODEC_OK) { - aom_internal_error(&cpi_lap->common.error, AOM_CODEC_ERROR, NULL); + aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL); } - cpi_lap->seq_params_locked = 1; } lib_flags = 0; frame_size = 0; @@ -2390,15 +2688,39 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, int64_t dst_time_stamp; int64_t dst_end_time_stamp; while (cx_data_sz >= ctx->cx_data_sz / 2 && !is_frame_visible) { +#if CONFIG_FRAME_PARALLEL_ENCODE + cpi->do_frame_data_update = true; + if (ppi->num_fp_contexts > 1 && ppi->gf_group.size > 1) { + if (cpi->gf_frame_index < ppi->gf_group.size) { + calc_frame_data_update_flag(&ppi->gf_group, cpi->gf_frame_index, + &cpi->do_frame_data_update); + } + } +#endif const int status = av1_get_compressed_data( - cpi, &lib_flags, &frame_size, cx_data, &dst_time_stamp, + cpi, &lib_flags, &frame_size, cx_data_sz, cx_data, &dst_time_stamp, &dst_end_time_stamp, !img, timestamp_ratio); if (status == -1) break; if (status != AOM_CODEC_OK) { - aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL); + aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL); } - cpi->seq_params_locked = 1; +#if CONFIG_ENTROPY_STATS + if (ppi->cpi->oxcf.pass != 1 && !cpi->common.show_existing_frame) + av1_accumulate_frame_counts(&ppi->aggregate_fc, &cpi->counts); +#endif +#if CONFIG_INTERNAL_STATS + if (ppi->cpi->oxcf.pass != 1) { + ppi->total_time_compress_data += cpi->time_compress_data; + ppi->total_recode_hits += cpi->frame_recode_hits; + ppi->total_bytes += cpi->bytes; + for (int i = 0; i < MAX_MODES; i++) { + ppi->total_mode_chosen_counts[i] += cpi->mode_chosen_counts[i]; + } + } +#endif // CONFIG_INTERNAL_STATS + + cpi->ppi->seq_params_locked = 1; if (!frame_size) continue; assert(cx_data != NULL && cx_data_sz != 0); const int write_temporal_delimiter = @@ -2413,12 +2735,13 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, const size_t move_offset = obu_header_size + length_field_size; memmove(ctx->cx_data + move_offset, 
ctx->cx_data, frame_size); obu_header_size = av1_write_obu_header( - &cpi->level_params, OBU_TEMPORAL_DELIMITER, 0, ctx->cx_data); + &cpi->ppi->level_params, &cpi->frame_header_count, + OBU_TEMPORAL_DELIMITER, 0, ctx->cx_data); // OBUs are preceded/succeeded by an unsigned leb128 coded integer. if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, ctx->cx_data) != AOM_CODEC_OK) { - aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL); + aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL); } frame_size += obu_header_size + obu_payload_size + length_field_size; @@ -2428,7 +2751,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, size_t curr_frame_size = frame_size; if (av1_convert_sect5obus_to_annexb(cx_data, &curr_frame_size) != AOM_CODEC_OK) { - aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL); + aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL); } frame_size = curr_frame_size; @@ -2437,7 +2760,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, memmove(cx_data + length_field_size, cx_data, frame_size); if (av1_write_uleb_obu_size(0, (uint32_t)frame_size, cx_data) != AOM_CODEC_OK) { - aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL); + aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL); } frame_size += length_field_size; } @@ -2458,7 +2781,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, aom_codec_cx_pkt_t pkt; // decrement frames_left counter - cpi->frames_left = AOMMAX(0, cpi->frames_left - 1); + cpi->ppi->frames_left = AOMMAX(0, cpi->ppi->frames_left - 1); if (ctx->oxcf.save_as_annexb) { // B_PRIME (add TU size) size_t tu_size = ctx->pending_cx_data_sz; @@ -2466,7 +2789,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, memmove(ctx->cx_data + length_field_size, ctx->cx_data, tu_size); if (av1_write_uleb_obu_size(0, (uint32_t)tu_size, ctx->cx_data) != AOM_CODEC_OK) { - aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL); + 
aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL); } ctx->pending_cx_data_sz += length_field_size; } @@ -2496,7 +2819,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, } } - cpi->common.error.setjmp = 0; + ppi->error.setjmp = 0; return res; } @@ -2674,7 +2997,7 @@ static aom_codec_err_t ctrl_set_number_spatial_layers(aom_codec_alg_priv_t *ctx, const int number_spatial_layers = va_arg(args, int); if (number_spatial_layers > MAX_NUM_SPATIAL_LAYERS) return AOM_CODEC_INVALID_PARAM; - ctx->ppi->cpi->common.number_spatial_layers = number_spatial_layers; + ctx->ppi->number_spatial_layers = number_spatial_layers; return AOM_CODEC_OK; } @@ -2690,19 +3013,20 @@ static aom_codec_err_t ctrl_set_layer_id(aom_codec_alg_priv_t *ctx, static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx, va_list args) { - AV1_COMP *const cpi = ctx->ppi->cpi; + AV1_PRIMARY *const ppi = ctx->ppi; + AV1_COMP *const cpi = ppi->cpi; AV1_COMMON *const cm = &cpi->common; aom_svc_params_t *const params = va_arg(args, aom_svc_params_t *); - cm->number_spatial_layers = params->number_spatial_layers; - cm->number_temporal_layers = params->number_temporal_layers; + ppi->number_spatial_layers = params->number_spatial_layers; + ppi->number_temporal_layers = params->number_temporal_layers; cpi->svc.number_spatial_layers = params->number_spatial_layers; cpi->svc.number_temporal_layers = params->number_temporal_layers; - if (cm->number_spatial_layers > 1 || cm->number_temporal_layers > 1) { + if (ppi->number_spatial_layers > 1 || ppi->number_temporal_layers > 1) { unsigned int sl, tl; - cpi->use_svc = 1; - for (sl = 0; sl < cm->number_spatial_layers; ++sl) { - for (tl = 0; tl < cm->number_temporal_layers; ++tl) { - const int layer = LAYER_IDS_TO_IDX(sl, tl, cm->number_temporal_layers); + ctx->ppi->use_svc = 1; + for (sl = 0; sl < ppi->number_spatial_layers; ++sl) { + for (tl = 0; tl < ppi->number_temporal_layers; ++tl) { + const int layer = LAYER_IDS_TO_IDX(sl, tl, 
ppi->number_temporal_layers); LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; lc->max_q = params->max_quantizers[layer]; lc->min_q = params->min_quantizers[layer]; @@ -2713,11 +3037,11 @@ static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx, } } if (cm->current_frame.frame_number == 0) { - if (!cpi->seq_params_locked) { - SequenceHeader *const seq_params = &cm->seq_params; + if (!cpi->ppi->seq_params_locked) { + SequenceHeader *const seq_params = &ppi->seq_params; seq_params->operating_points_cnt_minus_1 = - cm->number_spatial_layers * cm->number_temporal_layers - 1; - av1_init_seq_coding_tools(&cm->seq_params, cm, &cpi->oxcf, 1); + ppi->number_spatial_layers * ppi->number_temporal_layers - 1; + av1_init_seq_coding_tools(ppi, &cpi->oxcf, 1); } av1_init_layer_context(cpi); } @@ -2732,13 +3056,15 @@ static aom_codec_err_t ctrl_set_svc_ref_frame_config(aom_codec_alg_priv_t *ctx, AV1_COMP *const cpi = ctx->ppi->cpi; aom_svc_ref_frame_config_t *const data = va_arg(args, aom_svc_ref_frame_config_t *); - cpi->svc.external_ref_frame_config = 1; + cpi->svc.set_ref_frame_config = 1; for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; ++i) { cpi->svc.reference[i] = data->reference[i]; cpi->svc.ref_idx[i] = data->ref_idx[i]; } for (unsigned int i = 0; i < REF_FRAMES; ++i) cpi->svc.refresh[i] = data->refresh[i]; + cpi->svc.use_flexible_mode = 1; + cpi->svc.ksvc_fixed_mode = 0; return AOM_CODEC_OK; } @@ -2831,18 +3157,17 @@ static aom_codec_err_t encoder_set_option(aom_codec_alg_priv_t *ctx, // Used to mock the argv with just one string "--{name}={value}" char *argv[2] = { NULL, "" }; size_t len = strlen(name) + strlen(value) + 4; - char *err_string = ctx->ppi->cpi->common.error.detail; + char *err_string = ctx->ppi->error.detail; #if __STDC_VERSION__ >= 201112L // We use the keyword _Static_assert because clang-cl does not allow the // convenience macro static_assert to be used in function scope. See // https://bugs.llvm.org/show_bug.cgi?id=48904. 
- _Static_assert( - sizeof(ctx->ppi->cpi->common.error.detail) >= ARG_ERR_MSG_MAX_LEN, - "The size of the err_msg buffer for arg_match_helper must be " - "at least ARG_ERR_MSG_MAX_LEN"); + _Static_assert(sizeof(ctx->ppi->error.detail) >= ARG_ERR_MSG_MAX_LEN, + "The size of the err_msg buffer for arg_match_helper must be " + "at least ARG_ERR_MSG_MAX_LEN"); #else - assert(sizeof(ctx->ppi->cpi->common.error.detail) >= ARG_ERR_MSG_MAX_LEN); + assert(sizeof(ctx->ppi->error.detail) >= ARG_ERR_MSG_MAX_LEN); #endif argv[0] = aom_malloc(len * sizeof(argv[1][0])); @@ -2909,8 +3234,11 @@ static aom_codec_err_t encoder_set_option(aom_codec_alg_priv_t *ctx, extra_cfg.vmaf_model_path = value; } #endif - else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.cq_level, argv, - err_string)) { + else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.partition_info_path, + argv, err_string)) { + extra_cfg.partition_info_path = value; + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.cq_level, argv, + err_string)) { extra_cfg.cq_level = arg_parse_uint_helper(&arg, err_string); } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.max_intra_rate_pct, argv, err_string)) { @@ -3161,6 +3489,9 @@ static aom_codec_err_t encoder_set_option(aom_codec_alg_priv_t *ctx, } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.mv_cost_upd_freq, argv, err_string)) { extra_cfg.mv_cost_upd_freq = arg_parse_uint_helper(&arg, err_string); + } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.dv_cost_upd_freq, + argv, err_string)) { + extra_cfg.dv_cost_upd_freq = arg_parse_uint_helper(&arg, err_string); } #if CONFIG_DENOISE else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.denoise_noise_level, @@ -3215,9 +3546,8 @@ static aom_codec_err_t encoder_set_option(aom_codec_alg_priv_t *ctx, static aom_codec_err_t ctrl_get_seq_level_idx(aom_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); - const AV1_COMP *const cpi = ctx->ppi->cpi; if (arg == NULL) return 
AOM_CODEC_INVALID_PARAM; - return av1_get_seq_level_idx(&cpi->common.seq_params, &cpi->level_params, + return av1_get_seq_level_idx(&ctx->ppi->seq_params, &ctx->ppi->level_params, arg); } @@ -3332,6 +3662,7 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AV1E_SET_SUPERBLOCK_SIZE, ctrl_set_superblock_size }, { AV1E_SET_SINGLE_TILE_DECODING, ctrl_set_single_tile_decoding }, { AV1E_SET_VMAF_MODEL_PATH, ctrl_set_vmaf_model_path }, + { AV1E_SET_PARTITION_INFO_PATH, ctrl_set_partition_info_path }, { AV1E_SET_FILM_GRAIN_TEST_VECTOR, ctrl_set_film_grain_test_vector }, { AV1E_SET_FILM_GRAIN_TABLE, ctrl_set_film_grain_table }, { AV1E_SET_DENOISE_NOISE_LEVEL, ctrl_set_denoise_noise_level }, @@ -3347,6 +3678,8 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AV1E_SET_SVC_REF_FRAME_CONFIG, ctrl_set_svc_ref_frame_config }, { AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP, ctrl_set_vbr_corpus_complexity_lap }, { AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, ctrl_enable_sb_multipass_unit_test }, + { AV1E_SET_DV_COST_UPD_FREQ, ctrl_set_dv_cost_upd_freq }, + { AV1E_SET_EXTERNAL_PARTITION, ctrl_set_external_partition }, // Getters { AOME_GET_LAST_QUANTIZER, ctrl_get_quantizer }, @@ -3364,6 +3697,7 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { }; static const aom_codec_enc_cfg_t encoder_usage_cfg[] = { +#if !CONFIG_REALTIME_ONLY { // NOLINT AOM_USAGE_GOOD_QUALITY, // g_usage - non-realtime usage @@ -3415,25 +3749,26 @@ static const aom_codec_enc_cfg_t encoder_usage_cfg[] = { 2000, // rc_two_pass_vbrmax_section // keyframing settings (kf) - 0, // fwd_kf_enabled - AOM_KF_AUTO, // kf_mode - 0, // kf_min_dist - 9999, // kf_max_dist - 0, // sframe_dist - 1, // sframe_mode - 0, // large_scale_tile - 0, // monochrome - 0, // full_still_picture_hdr - 0, // save_as_annexb - 0, // tile_width_count - 0, // tile_height_count - { 0 }, // tile_widths - { 0 }, // tile_heights - 0, // use_fixed_qp_offsets - { -1, -1, -1, -1, -1 }, // fixed_qp_offsets + 0, // fwd_kf_enabled + AOM_KF_AUTO, 
// kf_mode + 0, // kf_min_dist + 9999, // kf_max_dist + 0, // sframe_dist + 1, // sframe_mode + 0, // large_scale_tile + 0, // monochrome + 0, // full_still_picture_hdr + 0, // save_as_annexb + 0, // tile_width_count + 0, // tile_height_count + { 0 }, // tile_widths + { 0 }, // tile_heights + 0, // use_fixed_qp_offsets + { -1, -1, -1, -1, -1, -1 }, // fixed_qp_offsets { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // cfg + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // cfg }, +#endif // !CONFIG_REALTIME_ONLY { // NOLINT AOM_USAGE_REALTIME, // g_usage - real-time usage @@ -3485,25 +3820,26 @@ static const aom_codec_enc_cfg_t encoder_usage_cfg[] = { 2000, // rc_two_pass_vbrmax_section // keyframing settings (kf) - 0, // fwd_kf_enabled - AOM_KF_AUTO, // kf_mode - 0, // kf_min_dist - 9999, // kf_max_dist - 0, // sframe_dist - 1, // sframe_mode - 0, // large_scale_tile - 0, // monochrome - 0, // full_still_picture_hdr - 0, // save_as_annexb - 0, // tile_width_count - 0, // tile_height_count - { 0 }, // tile_widths - { 0 }, // tile_heights - 0, // use_fixed_qp_offsets - { -1, -1, -1, -1, -1 }, // fixed_qp_offsets + 0, // fwd_kf_enabled + AOM_KF_AUTO, // kf_mode + 0, // kf_min_dist + 9999, // kf_max_dist + 0, // sframe_dist + 1, // sframe_mode + 0, // large_scale_tile + 0, // monochrome + 0, // full_still_picture_hdr + 0, // save_as_annexb + 0, // tile_width_count + 0, // tile_height_count + { 0 }, // tile_widths + { 0 }, // tile_heights + 0, // use_fixed_qp_offsets + { -1, -1, -1, -1, -1, -1 }, // fixed_qp_offsets { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // cfg + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // cfg }, +#if !CONFIG_REALTIME_ONLY { // NOLINT AOM_USAGE_ALL_INTRA, // g_usage - all intra usage @@ -3572,8 +3908,9 @@ static const aom_codec_enc_cfg_t encoder_usage_cfg[] = { 0, // use_fixed_qp_offsets 
{ -1, -1, -1, -1, -1 }, // fixed_qp_offsets { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // cfg + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // cfg }, +#endif // !CONFIG_REALTIME_ONLY }; // This data structure and function are exported in aom/aomcx.h @@ -3598,13 +3935,13 @@ aom_codec_iface_t aom_codec_av1_cx_algo = { }, { // NOLINT - 3, // 3 cfg - encoder_usage_cfg, // aom_codec_enc_cfg_t - encoder_encode, // aom_codec_encode_fn_t - encoder_get_cxdata, // aom_codec_get_cx_data_fn_t - encoder_set_config, // aom_codec_enc_config_set_fn_t - encoder_get_global_headers, // aom_codec_get_global_headers_fn_t - encoder_get_preview // aom_codec_get_preview_frame_fn_t + NELEMENTS(encoder_usage_cfg), // cfg_count + encoder_usage_cfg, // aom_codec_enc_cfg_t + encoder_encode, // aom_codec_encode_fn_t + encoder_get_cxdata, // aom_codec_get_cx_data_fn_t + encoder_set_config, // aom_codec_enc_config_set_fn_t + encoder_get_global_headers, // aom_codec_get_global_headers_fn_t + encoder_get_preview // aom_codec_get_preview_frame_fn_t }, encoder_set_option // aom_codec_set_option_fn_t }; diff --git a/third_party/libaom/source/libaom/av1/av1_dx_iface.c b/third_party/libaom/source/libaom/av1/av1_dx_iface.c index 1ee8a576d3..02968abd16 100644 --- a/third_party/libaom/source/libaom/av1/av1_dx_iface.c +++ b/third_party/libaom/source/libaom/av1/av1_dx_iface.c @@ -115,14 +115,18 @@ static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) { if (ctx->frame_worker != NULL) { AVxWorker *const worker = ctx->frame_worker; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + AV1Decoder *const pbi = frame_worker_data->pbi; aom_get_worker_interface()->end(worker); - aom_free(frame_worker_data->pbi->common.tpl_mvs); - frame_worker_data->pbi->common.tpl_mvs = NULL; + aom_free(pbi->common.tpl_mvs); + pbi->common.tpl_mvs = NULL; av1_remove_common(&frame_worker_data->pbi->common); + 
av1_free_cdef_buffers(&pbi->common, &pbi->cdef_worker, &pbi->cdef_sync, + pbi->num_workers); + av1_free_cdef_sync(&pbi->cdef_sync); #if !CONFIG_REALTIME_ONLY - av1_free_restoration_buffers(&frame_worker_data->pbi->common); + av1_free_restoration_buffers(&pbi->common); #endif - av1_decoder_remove(frame_worker_data->pbi); + av1_decoder_remove(pbi); aom_free(frame_worker_data); #if CONFIG_MULTITHREAD pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex); @@ -392,7 +396,7 @@ static void init_buffer_callbacks(aom_codec_alg_priv_t *ctx) { pool->release_fb_cb = av1_release_frame_buffer; if (av1_alloc_internal_frame_buffers(&pool->int_frame_buffers)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Failed to initialize internal frame buffers"); pool->cb_priv = &pool->int_frame_buffers; @@ -527,7 +531,7 @@ static aom_codec_err_t decode_one(aom_codec_alg_priv_t *ctx, *data = frame_worker_data->data_end; if (worker->had_error) - return update_error_state(ctx, &frame_worker_data->pbi->common.error); + return update_error_state(ctx, &frame_worker_data->pbi->error); check_resync(ctx, frame_worker_data->pbi); @@ -558,7 +562,7 @@ static aom_codec_err_t decoder_inspect(aom_codec_alg_priv_t *ctx, check_resync(ctx, frame_worker_data->pbi); if (ctx->frame_worker->had_error) - return update_error_state(ctx, &frame_worker_data->pbi->common.error); + return update_error_state(ctx, &frame_worker_data->pbi->error); // Allow extra zero bytes after the frame end while (data < data_end) { @@ -823,7 +827,7 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx, aom_image_t *res = add_grain_if_needed(ctx, img, &ctx->image_with_grain, grain_params); if (!res) { - aom_internal_error(&pbi->common.error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Grain systhesis failed\n"); } *index += 1; // Advance the iterator to point to the next image @@ -1091,10 +1095,9 @@ static aom_codec_err_t 
ctrl_get_still_picture(aom_codec_alg_priv_t *ctx, FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; const AV1Decoder *pbi = frame_worker_data->pbi; - still_picture_info->is_still_picture = - (int)pbi->common.seq_params.still_picture; + still_picture_info->is_still_picture = (int)pbi->seq_params.still_picture; still_picture_info->is_reduced_still_picture_hdr = - (int)(pbi->common.seq_params.reduced_still_picture_hdr); + (int)(pbi->seq_params.reduced_still_picture_hdr); return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; @@ -1112,7 +1115,7 @@ static aom_codec_err_t ctrl_get_sb_size(aom_codec_alg_priv_t *ctx, FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; const AV1Decoder *pbi = frame_worker_data->pbi; - if (pbi->common.seq_params.sb_size == BLOCK_128X128) { + if (pbi->seq_params.sb_size == BLOCK_128X128) { *sb_size = AOM_SUPERBLOCK_SIZE_128X128; } else { *sb_size = AOM_SUPERBLOCK_SIZE_64X64; @@ -1291,7 +1294,7 @@ static aom_codec_err_t ctrl_get_bit_depth(aom_codec_alg_priv_t *ctx, FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; const AV1_COMMON *const cm = &frame_worker_data->pbi->common; - *bit_depth = cm->seq_params.bit_depth; + *bit_depth = cm->seq_params->bit_depth; return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; @@ -1327,9 +1330,9 @@ static aom_codec_err_t ctrl_get_img_format(aom_codec_alg_priv_t *ctx, (FrameWorkerData *)worker->data1; const AV1_COMMON *const cm = &frame_worker_data->pbi->common; - *img_fmt = get_img_format(cm->seq_params.subsampling_x, - cm->seq_params.subsampling_y, - cm->seq_params.use_highbitdepth); + *img_fmt = get_img_format(cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y, + cm->seq_params->use_highbitdepth); return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; diff --git a/third_party/libaom/source/libaom/av1/common/alloccommon.c b/third_party/libaom/source/libaom/av1/common/alloccommon.c index cd997cd875..8624255218 100644 --- 
a/third_party/libaom/source/libaom/av1/common/alloccommon.c +++ b/third_party/libaom/source/libaom/av1/common/alloccommon.c @@ -17,8 +17,10 @@ #include "av1/common/alloccommon.h" #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" +#include "av1/common/cdef_block.h" #include "av1/common/entropymode.h" #include "av1/common/entropymv.h" +#include "av1/common/thread_common.h" int av1_get_MBs(int width, int height) { const int aligned_width = ALIGN_POWER_OF_TWO(width, 3); @@ -51,6 +53,227 @@ void av1_free_ref_frame_buffers(BufferPool *pool) { } } +static INLINE void free_cdef_linebuf_conditional( + AV1_COMMON *const cm, const size_t *new_linebuf_size) { + CdefInfo *cdef_info = &cm->cdef_info; + for (int plane = 0; plane < MAX_MB_PLANE; plane++) { + if (new_linebuf_size[plane] != cdef_info->allocated_linebuf_size[plane]) { + aom_free(cdef_info->linebuf[plane]); + cdef_info->linebuf[plane] = NULL; + } + } +} + +static INLINE void free_cdef_bufs_conditional(AV1_COMMON *const cm, + uint16_t **colbuf, + uint16_t **srcbuf, + const size_t *new_colbuf_size, + const size_t new_srcbuf_size) { + CdefInfo *cdef_info = &cm->cdef_info; + if (new_srcbuf_size != cdef_info->allocated_srcbuf_size) { + aom_free(*srcbuf); + *srcbuf = NULL; + } + for (int plane = 0; plane < MAX_MB_PLANE; plane++) { + if (new_colbuf_size[plane] != cdef_info->allocated_colbuf_size[plane]) { + aom_free(colbuf[plane]); + colbuf[plane] = NULL; + } + } +} + +static INLINE void free_cdef_bufs(uint16_t **colbuf, uint16_t **srcbuf) { + aom_free(*srcbuf); + *srcbuf = NULL; + for (int plane = 0; plane < MAX_MB_PLANE; plane++) { + aom_free(colbuf[plane]); + colbuf[plane] = NULL; + } +} + +static INLINE void free_cdef_row_sync(AV1CdefRowSync **cdef_row_mt, + const int num_mi_rows) { + if (*cdef_row_mt == NULL) return; +#if CONFIG_MULTITHREAD + for (int row_idx = 0; row_idx < num_mi_rows; row_idx++) { + pthread_mutex_destroy((*cdef_row_mt)[row_idx].row_mutex_); + 
pthread_cond_destroy((*cdef_row_mt)[row_idx].row_cond_); + aom_free((*cdef_row_mt)[row_idx].row_mutex_); + aom_free((*cdef_row_mt)[row_idx].row_cond_); + } +#else + (void)num_mi_rows; +#endif // CONFIG_MULTITHREAD + aom_free(*cdef_row_mt); + *cdef_row_mt = NULL; +} + +void av1_free_cdef_buffers(AV1_COMMON *const cm, + AV1CdefWorkerData **cdef_worker, + AV1CdefSync *cdef_sync, int num_workers) { + CdefInfo *cdef_info = &cm->cdef_info; + const int num_mi_rows = cdef_info->allocated_mi_rows; + + for (int plane = 0; plane < MAX_MB_PLANE; plane++) { + aom_free(cdef_info->linebuf[plane]); + cdef_info->linebuf[plane] = NULL; + } + // De-allocation of column buffer & source buffer (worker_0). + free_cdef_bufs(cdef_info->colbuf, &cdef_info->srcbuf); + + if (num_workers < 2) return; + if (*cdef_worker != NULL) { + for (int idx = num_workers - 1; idx >= 1; idx--) { + // De-allocation of column buffer & source buffer for remaining workers. + free_cdef_bufs((*cdef_worker)[idx].colbuf, &(*cdef_worker)[idx].srcbuf); + } + aom_free(*cdef_worker); + *cdef_worker = NULL; + } + free_cdef_row_sync(&cdef_sync->cdef_row_mt, num_mi_rows); +} + +static INLINE void alloc_cdef_linebuf(AV1_COMMON *const cm, uint16_t **linebuf, + const int num_planes) { + CdefInfo *cdef_info = &cm->cdef_info; + for (int plane = 0; plane < num_planes; plane++) { + if (linebuf[plane] == NULL) + CHECK_MEM_ERROR(cm, linebuf[plane], + aom_malloc(cdef_info->allocated_linebuf_size[plane])); + } +} + +static INLINE void alloc_cdef_bufs(AV1_COMMON *const cm, uint16_t **colbuf, + uint16_t **srcbuf, const int num_planes) { + CdefInfo *cdef_info = &cm->cdef_info; + if (*srcbuf == NULL) + CHECK_MEM_ERROR(cm, *srcbuf, + aom_memalign(16, cdef_info->allocated_srcbuf_size)); + + for (int plane = 0; plane < num_planes; plane++) { + if (colbuf[plane] == NULL) + CHECK_MEM_ERROR(cm, colbuf[plane], + aom_malloc(cdef_info->allocated_colbuf_size[plane])); + } +} + +static INLINE void alloc_cdef_row_sync(AV1_COMMON *const cm, + 
AV1CdefRowSync **cdef_row_mt, + const int num_mi_rows) { + if (*cdef_row_mt != NULL) return; + + CHECK_MEM_ERROR(cm, *cdef_row_mt, + aom_malloc(sizeof(**cdef_row_mt) * num_mi_rows)); +#if CONFIG_MULTITHREAD + for (int row_idx = 0; row_idx < num_mi_rows; row_idx++) { + CHECK_MEM_ERROR(cm, (*cdef_row_mt)[row_idx].row_mutex_, + aom_malloc(sizeof(*(*cdef_row_mt)[row_idx].row_mutex_))); + pthread_mutex_init((*cdef_row_mt)[row_idx].row_mutex_, NULL); + + CHECK_MEM_ERROR(cm, (*cdef_row_mt)[row_idx].row_cond_, + aom_malloc(sizeof(*(*cdef_row_mt)[row_idx].row_cond_))); + pthread_cond_init((*cdef_row_mt)[row_idx].row_cond_, NULL); + + (*cdef_row_mt)[row_idx].is_row_done = 0; + } +#endif // CONFIG_MULTITHREAD +} + +void av1_alloc_cdef_buffers(AV1_COMMON *const cm, + AV1CdefWorkerData **cdef_worker, + AV1CdefSync *cdef_sync, int num_workers) { + const int num_planes = av1_num_planes(cm); + size_t new_linebuf_size[MAX_MB_PLANE] = { 0 }; + size_t new_colbuf_size[MAX_MB_PLANE] = { 0 }; + size_t new_srcbuf_size = 0; + CdefInfo *const cdef_info = &cm->cdef_info; + // Check for configuration change + const int num_mi_rows = + (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + const int is_num_workers_changed = + cdef_info->allocated_num_workers != num_workers; + const int is_cdef_enabled = + cm->seq_params->enable_cdef && !cm->tiles.large_scale; + + // num-bufs=3 represents ping-pong buffers for top linebuf, + // followed by bottom linebuf. + // ping-pong is to avoid top linebuf over-write by consecutive row. + int num_bufs = 3; + if (num_workers > 1) + num_bufs = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + + if (is_cdef_enabled) { + // Calculate src buffer size + new_srcbuf_size = sizeof(*cdef_info->srcbuf) * CDEF_INBUF_SIZE; + for (int plane = 0; plane < num_planes; plane++) { + const int shift = + plane == AOM_PLANE_Y ? 
0 : cm->seq_params->subsampling_x; + // Calculate top and bottom line buffer size + const int luma_stride = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4); + new_linebuf_size[plane] = sizeof(*cdef_info->linebuf) * num_bufs * + (CDEF_VBORDER << 1) * (luma_stride >> shift); + // Calculate column buffer size + const int block_height = + (CDEF_BLOCKSIZE << (MI_SIZE_LOG2 - shift)) * 2 * CDEF_VBORDER; + new_colbuf_size[plane] = + sizeof(*cdef_info->colbuf[plane]) * block_height * CDEF_HBORDER; + } + } + + // Free src, line and column buffers for worker 0 in case of reallocation + free_cdef_linebuf_conditional(cm, new_linebuf_size); + free_cdef_bufs_conditional(cm, cdef_info->colbuf, &cdef_info->srcbuf, + new_colbuf_size, new_srcbuf_size); + + if (*cdef_worker != NULL) { + if (is_num_workers_changed) { + // Free src and column buffers for remaining workers in case of change in + // num_workers + for (int idx = cdef_info->allocated_num_workers - 1; idx >= 1; idx--) + free_cdef_bufs((*cdef_worker)[idx].colbuf, &(*cdef_worker)[idx].srcbuf); + } else if (num_workers > 1) { + // Free src and column buffers for remaining workers in case of + // reallocation + for (int idx = num_workers - 1; idx >= 1; idx--) + free_cdef_bufs_conditional(cm, (*cdef_worker)[idx].colbuf, + &(*cdef_worker)[idx].srcbuf, new_colbuf_size, + new_srcbuf_size); + } + } + + if (cdef_info->allocated_mi_rows != num_mi_rows) + free_cdef_row_sync(&cdef_sync->cdef_row_mt, cdef_info->allocated_mi_rows); + + // Store allocated sizes for reallocation + cdef_info->allocated_srcbuf_size = new_srcbuf_size; + av1_copy(cdef_info->allocated_colbuf_size, new_colbuf_size); + av1_copy(cdef_info->allocated_linebuf_size, new_linebuf_size); + // Store configuration to check change in configuration + cdef_info->allocated_mi_rows = num_mi_rows; + cdef_info->allocated_num_workers = num_workers; + + if (!is_cdef_enabled) return; + + // Memory allocation of column buffer & source buffer (worker_0). 
+ alloc_cdef_bufs(cm, cdef_info->colbuf, &cdef_info->srcbuf, num_planes); + alloc_cdef_linebuf(cm, cdef_info->linebuf, num_planes); + + if (num_workers < 2) return; + + if (*cdef_worker == NULL) + CHECK_MEM_ERROR(cm, *cdef_worker, + aom_calloc(num_workers, sizeof(**cdef_worker))); + + // Memory allocation of column buffer & source buffer for remaining workers. + for (int idx = num_workers - 1; idx >= 1; idx--) + alloc_cdef_bufs(cm, (*cdef_worker)[idx].colbuf, &(*cdef_worker)[idx].srcbuf, + num_planes); + + alloc_cdef_row_sync(cm, &cdef_sync->cdef_row_mt, + cdef_info->allocated_mi_rows); +} + #if !CONFIG_REALTIME_ONLY // Assumes cm->rst_info[p].restoration_unit_size is already initialized void av1_alloc_restoration_buffers(AV1_COMMON *cm) { @@ -86,11 +309,11 @@ void av1_alloc_restoration_buffers(AV1_COMMON *cm) { // Now we need to allocate enough space to store the line buffers for the // stripes const int frame_w = cm->superres_upscaled_width; - const int use_highbd = cm->seq_params.use_highbitdepth; + const int use_highbd = cm->seq_params->use_highbitdepth; for (int p = 0; p < num_planes; ++p) { const int is_uv = p > 0; - const int ss_x = is_uv && cm->seq_params.subsampling_x; + const int ss_x = is_uv && cm->seq_params->subsampling_x; const int plane_w = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ; const int stride = ALIGN_POWER_OF_TWO(plane_w, 5); const int buf_size = num_stripes * stride * RESTORATION_CTX_VERT diff --git a/third_party/libaom/source/libaom/av1/common/alloccommon.h b/third_party/libaom/source/libaom/av1/common/alloccommon.h index e75c226831..0b43889d20 100644 --- a/third_party/libaom/source/libaom/av1/common/alloccommon.h +++ b/third_party/libaom/source/libaom/av1/common/alloccommon.h @@ -24,6 +24,8 @@ struct AV1Common; struct BufferPool; struct CommonContexts; struct CommonModeInfoParams; +struct AV1CdefWorker; +struct AV1CdefSyncData; void av1_remove_common(struct AV1Common *cm); @@ -36,6 +38,12 @@ void av1_init_mi_buffers(struct 
CommonModeInfoParams *mi_params); void av1_free_context_buffers(struct AV1Common *cm); void av1_free_ref_frame_buffers(struct BufferPool *pool); +void av1_alloc_cdef_buffers(struct AV1Common *const cm, + struct AV1CdefWorker **cdef_worker, + struct AV1CdefSyncData *cdef_sync, int num_workers); +void av1_free_cdef_buffers(struct AV1Common *const cm, + struct AV1CdefWorker **cdef_worker, + struct AV1CdefSyncData *cdef_sync, int num_workers); #if !CONFIG_REALTIME_ONLY void av1_alloc_restoration_buffers(struct AV1Common *cm); void av1_free_restoration_buffers(struct AV1Common *cm); diff --git a/third_party/libaom/source/libaom/av1/common/av1_common_int.h b/third_party/libaom/source/libaom/av1/common/av1_common_int.h index 0a68cb5fd5..981a186579 100644 --- a/third_party/libaom/source/libaom/av1/common/av1_common_int.h +++ b/third_party/libaom/source/libaom/av1/common/av1_common_int.h @@ -135,7 +135,10 @@ typedef struct RefCntBuffer { // distance when a very old frame is used as a reference. unsigned int display_order_hint; unsigned int ref_display_order_hint[INTER_REFS_PER_FRAME]; - +#if CONFIG_FRAME_PARALLEL_ENCODE + // Frame's level within the hierarchical structure. + unsigned int pyramid_level; +#endif // CONFIG_FRAME_PARALLEL_ENCODE MV_REF *mvs; uint8_t *seg_map; struct segmentation seg; @@ -192,12 +195,32 @@ typedef struct BufferPool { /*!\brief Parameters related to CDEF */ typedef struct { - int cdef_damping; /*!< CDEF damping factor */ - int nb_cdef_strengths; /*!< Number of CDEF strength values */ - int cdef_strengths[CDEF_MAX_STRENGTHS]; /*!< CDEF strength values for luma */ - int cdef_uv_strengths[CDEF_MAX_STRENGTHS]; /*!< CDEF strength values for - chroma */ - int cdef_bits; /*!< Number of CDEF strength values in bits */ + //! CDEF column line buffer + uint16_t *colbuf[MAX_MB_PLANE]; + //! CDEF top & bottom line buffer + uint16_t *linebuf[MAX_MB_PLANE]; + //! CDEF intermediate buffer + uint16_t *srcbuf; + //! 
CDEF column line buffer sizes + size_t allocated_colbuf_size[MAX_MB_PLANE]; + //! CDEF top and bottom line buffer sizes + size_t allocated_linebuf_size[MAX_MB_PLANE]; + //! CDEF intermediate buffer size + size_t allocated_srcbuf_size; + //! CDEF damping factor + int cdef_damping; + //! Number of CDEF strength values + int nb_cdef_strengths; + //! CDEF strength values for luma + int cdef_strengths[CDEF_MAX_STRENGTHS]; + //! CDEF strength values for chroma + int cdef_uv_strengths[CDEF_MAX_STRENGTHS]; + //! Number of CDEF strength values in bits + int cdef_bits; + //! Number of rows in the frame in 4 pixel + int allocated_mi_rows; + //! Number of CDEF workers + int allocated_num_workers; } CdefInfo; /*!\cond */ @@ -320,6 +343,10 @@ typedef struct { unsigned int order_hint; unsigned int display_order_hint; +#if CONFIG_FRAME_PARALLEL_ENCODE + // Frame's level within the hierarchical structure. + unsigned int pyramid_level; +#endif // CONFIG_FRAME_PARALLEL_ENCODE unsigned int frame_number; SkipModeInfo skip_mode_info; int refresh_frame_flags; // Which ref frames are overwritten by this frame @@ -602,12 +629,12 @@ struct CommonQuantParams { /*! * Delta of qindex (from base_qindex) for V plane DC coefficients. - * Same as those for U plane if cm->seq_params.separate_uv_delta_q == 0. + * Same as those for U plane if cm->seq_params->separate_uv_delta_q == 0. */ int u_ac_delta_q; /*! * Delta of qindex (from base_qindex) for V plane AC coefficients. - * Same as those for U plane if cm->seq_params.separate_uv_delta_q == 0. + * Same as those for U plane if cm->seq_params->separate_uv_delta_q == 0. */ int v_ac_delta_q; @@ -728,7 +755,7 @@ typedef struct AV1Common { /*! * Code and details about current error status. */ - struct aom_internal_error_info error; + struct aom_internal_error_info *error; /*! * AV1 allows two types of frame scaling operations: @@ -780,10 +807,6 @@ typedef struct AV1Common { uint8_t superres_scale_denominator; /*! 
- * If true, buffer removal times are present. - */ - bool buffer_removal_time_present; - /*! * buffer_removal_times[op_num] specifies the frame removal time in units of * DecCT clock ticks counted from the removal time of the last random access * point for operating point op_num. @@ -950,7 +973,7 @@ typedef struct AV1Common { * Elements part of the sequence header, that are applicable for all the * frames in the video. */ - SequenceHeader seq_params; + SequenceHeader *seq_params; /*! * Current CDFs of all the symbols for the current frame. @@ -982,7 +1005,7 @@ typedef struct AV1Common { CommonContexts above_contexts; /** - * \name Signaled when cm->seq_params.frame_id_numbers_present_flag == 1 + * \name Signaled when cm->seq_params->frame_id_numbers_present_flag == 1 */ /**@{*/ int current_frame_id; /*!< frame ID for the current frame. */ @@ -1014,20 +1037,12 @@ typedef struct AV1Common { int8_t ref_frame_side[REF_FRAMES]; /*! - * Number of temporal layers: may be > 1 for SVC (scalable vector coding). - */ - unsigned int number_temporal_layers; - /*! * Temporal layer ID of this frame * (in the range 0 ... (number_temporal_layers - 1)). */ int temporal_layer_id; /*! - * Number of spatial layers: may be > 1 for SVC (scalable vector coding). - */ - unsigned int number_spatial_layers; - /*! * Spatial layer ID of this frame * (in the range 0 ... (number_spatial_layers - 1)). */ @@ -1192,15 +1207,15 @@ static INLINE RefCntBuffer *get_primary_ref_frame_buf( // Returns 1 if this frame might allow mvs from some reference frame. 
static INLINE int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) { return !cm->features.error_resilient_mode && - cm->seq_params.order_hint_info.enable_ref_frame_mvs && - cm->seq_params.order_hint_info.enable_order_hint && + cm->seq_params->order_hint_info.enable_ref_frame_mvs && + cm->seq_params->order_hint_info.enable_order_hint && !frame_is_intra_only(cm); } // Returns 1 if this frame might use warped_motion static INLINE int frame_might_allow_warped_motion(const AV1_COMMON *cm) { return !cm->features.error_resilient_mode && !frame_is_intra_only(cm) && - cm->seq_params.enable_warped_motion; + cm->seq_params->enable_warped_motion; } static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) { @@ -1240,7 +1255,7 @@ static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) { void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params); static INLINE int av1_num_planes(const AV1_COMMON *cm) { - return cm->seq_params.monochrome ? 1 : MAX_MB_PLANE; + return cm->seq_params->monochrome ? 
1 : MAX_MB_PLANE; } static INLINE void av1_init_above_context(CommonContexts *above_contexts, @@ -1279,8 +1294,8 @@ static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd) { } } xd->mi_stride = cm->mi_params.mi_stride; - xd->error_info = &cm->error; - cfl_init(&xd->cfl, &cm->seq_params); + xd->error_info = cm->error; + cfl_init(&xd->cfl, cm->seq_params); } static INLINE void set_entropy_context(MACROBLOCKD *xd, int mi_row, int mi_col, @@ -1562,7 +1577,7 @@ static INLINE void av1_zero_above_context(AV1_COMMON *const cm, const MACROBLOCKD *xd, int mi_col_start, int mi_col_end, const int tile_row) { - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; const int num_planes = av1_num_planes(cm); const int width = mi_col_end - mi_col_start; const int aligned_width = diff --git a/third_party/libaom/source/libaom/av1/common/av1_loopfilter.c b/third_party/libaom/source/libaom/av1/common/av1_loopfilter.c index caa15c21e2..18ae0f28f4 100644 --- a/third_party/libaom/source/libaom/av1/common/av1_loopfilter.c +++ b/third_party/libaom/source/libaom/av1/common/av1_loopfilter.c @@ -351,8 +351,14 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm, const uint32_t scale_vert = plane_ptr->subsampling_y; uint8_t *const dst_ptr = plane_ptr->dst.buf; const int dst_stride = plane_ptr->dst.stride; - const int y_range = (MAX_MIB_SIZE >> scale_vert); - const int x_range = (MAX_MIB_SIZE >> scale_horz); + const int plane_mi_rows = + ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert); + const int plane_mi_cols = + ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz); + const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)), + (MAX_MIB_SIZE >> scale_vert)); + const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)), + (MAX_MIB_SIZE >> scale_horz)); for (int y = 0; y < y_range; y++) { uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride; for (int x = 0; x < x_range;) { @@ 
-376,8 +382,8 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm, } #if CONFIG_AV1_HIGHBITDEPTH - const int use_highbitdepth = cm->seq_params.use_highbitdepth; - const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth; + const int use_highbitdepth = cm->seq_params->use_highbitdepth; + const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth; switch (params.filter_length) { // apply 4-tap filtering case 4: @@ -456,6 +462,84 @@ void av1_filter_block_plane_vert(const AV1_COMMON *const cm, } } +void av1_filter_block_plane_vert_rt(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, + const uint32_t mi_col) { + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + uint8_t *const dst_ptr = plane_ptr->dst.buf; + const int dst_stride = plane_ptr->dst.stride; + const int plane_mi_rows = + ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert); + const int plane_mi_cols = + ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz); + const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)), + (MAX_MIB_SIZE >> scale_vert)); + const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)), + (MAX_MIB_SIZE >> scale_horz)); + assert(!plane); + assert(!(y_range % 2)); + for (int y = 0; y < y_range; y += 2) { + uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride; + for (int x = 0; x < x_range;) { + // inner loop always filter vertical edges in a MI block. If MI size + // is 8x8, it will filter the vertical edge aligned with a 8x8 block. 
+ // If 4x4 transform is used, it will then filter the internal edge + // aligned with a 4x4 block + const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; + const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; + uint32_t advance_units; + TX_SIZE tx_size; + AV1_DEBLOCKING_PARAMETERS params; + memset(¶ms, 0, sizeof(params)); + + tx_size = + set_lpf_parameters(¶ms, ((ptrdiff_t)1 << scale_horz), cm, xd, + VERT_EDGE, curr_x, curr_y, plane, plane_ptr); + if (tx_size == TX_INVALID) { + params.filter_length = 0; + tx_size = TX_4X4; + } + + switch (params.filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_vertical_4_dual(p, dst_stride, params.mblim, params.lim, + params.hev_thr, params.mblim, params.lim, + params.hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + assert(plane != 0); + aom_lpf_vertical_6_dual(p, dst_stride, params.mblim, params.lim, + params.hev_thr, params.mblim, params.lim, + params.hev_thr); + break; + // apply 8-tap filtering + case 8: + aom_lpf_vertical_8_dual(p, dst_stride, params.mblim, params.lim, + params.hev_thr, params.mblim, params.lim, + params.hev_thr); + break; + // apply 14-tap filtering + case 14: + aom_lpf_vertical_14_dual(p, dst_stride, params.mblim, params.lim, + params.hev_thr, params.mblim, params.lim, + params.hev_thr); + break; + // no filtering + default: break; + } + // advance the destination pointer + advance_units = tx_size_wide_unit[tx_size]; + x += advance_units; + p += advance_units * MI_SIZE; + } + } +} + void av1_filter_block_plane_horz(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, const int plane, const MACROBLOCKD_PLANE *const plane_ptr, @@ -464,8 +548,14 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm, const uint32_t scale_vert = plane_ptr->subsampling_y; uint8_t *const dst_ptr = plane_ptr->dst.buf; const int dst_stride = plane_ptr->dst.stride; - const int y_range = (MAX_MIB_SIZE >> scale_vert); - const int x_range = 
(MAX_MIB_SIZE >> scale_horz); + const int plane_mi_rows = + ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert); + const int plane_mi_cols = + ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz); + const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)), + (MAX_MIB_SIZE >> scale_vert)); + const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)), + (MAX_MIB_SIZE >> scale_horz)); for (int x = 0; x < x_range; x++) { uint8_t *p = dst_ptr + x * MI_SIZE; for (int y = 0; y < y_range;) { @@ -489,8 +579,8 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm, } #if CONFIG_AV1_HIGHBITDEPTH - const int use_highbitdepth = cm->seq_params.use_highbitdepth; - const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth; + const int use_highbitdepth = cm->seq_params->use_highbitdepth; + const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth; switch (params.filter_length) { // apply 4-tap filtering case 4: @@ -572,6 +662,84 @@ void av1_filter_block_plane_horz(const AV1_COMMON *const cm, } } +void av1_filter_block_plane_horz_rt(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, + const uint32_t mi_col) { + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + uint8_t *const dst_ptr = plane_ptr->dst.buf; + const int dst_stride = plane_ptr->dst.stride; + const int plane_mi_rows = + ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert); + const int plane_mi_cols = + ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz); + const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)), + (MAX_MIB_SIZE >> scale_vert)); + const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)), + (MAX_MIB_SIZE >> scale_horz)); + assert(!plane); + for (int x = 0; x < x_range; x += 2) { + uint8_t *p = dst_ptr + x * MI_SIZE; + for (int y = 0; y < y_range;) { + // inner loop always 
filter vertical edges in a MI block. If MI size + // is 8x8, it will first filter the vertical edge aligned with a 8x8 + // block. If 4x4 transform is used, it will then filter the internal + // edge aligned with a 4x4 block + const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; + const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; + uint32_t advance_units; + TX_SIZE tx_size; + AV1_DEBLOCKING_PARAMETERS params; + memset(¶ms, 0, sizeof(params)); + + tx_size = set_lpf_parameters( + ¶ms, (cm->mi_params.mi_stride << scale_vert), cm, xd, HORZ_EDGE, + curr_x, curr_y, plane, plane_ptr); + if (tx_size == TX_INVALID) { + params.filter_length = 0; + tx_size = TX_4X4; + } + + switch (params.filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_horizontal_4_dual(p, dst_stride, params.mblim, params.lim, + params.hev_thr, params.mblim, params.lim, + params.hev_thr); + break; + // apply 6-tap filtering + case 6: + assert(plane != 0); + aom_lpf_horizontal_6_dual(p, dst_stride, params.mblim, params.lim, + params.hev_thr, params.mblim, params.lim, + params.hev_thr); + break; + // apply 8-tap filtering + case 8: + aom_lpf_horizontal_8_dual(p, dst_stride, params.mblim, params.lim, + params.hev_thr, params.mblim, params.lim, + params.hev_thr); + break; + // apply 14-tap filtering + case 14: + aom_lpf_horizontal_14_dual(p, dst_stride, params.mblim, params.lim, + params.hev_thr, params.mblim, params.lim, + params.hev_thr); + break; + // no filtering + default: break; + } + // advance the destination pointer + advance_units = tx_size_high_unit[tx_size]; + y += advance_units; + p += advance_units * dst_stride * MI_SIZE; + } + } +} + void av1_filter_block_plane_vert_test(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, const int plane, @@ -661,7 +829,7 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, #if CONFIG_LPF_MASK int is_decoding, #endif - int plane_start, int plane_end) { + int plane_start, 
int plane_end, int is_realtime) { struct macroblockd_plane *pd = xd->plane; const int col_start = 0; const int col_end = cm->mi_params.mi_cols; @@ -679,7 +847,7 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, else if (plane == 2 && !(cm->lf.filter_level_v)) continue; - av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, 0, 0, + av1_setup_dst_planes(pd, cm->seq_params->sb_size, frame_buffer, 0, 0, plane, plane + 1); av1_build_bitmask_vert_info(cm, &pd[plane], plane); @@ -716,49 +884,106 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, continue; else if (plane == 2 && !(cm->lf.filter_level_v)) continue; - if (cm->lf.combine_vert_horz_lf) { // filter all vertical and horizontal edges in every 128x128 super block for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) { // filter vertical edges - av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, - mi_col, plane, plane + 1); + av1_setup_dst_planes(pd, cm->seq_params->sb_size, frame_buffer, + mi_row, mi_col, plane, plane + 1); +#if CONFIG_AV1_HIGHBITDEPTH + (void)is_realtime; av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row, mi_col); +#else + if (is_realtime && !plane) { + av1_filter_block_plane_vert_rt(cm, xd, plane, &pd[plane], mi_row, + mi_col); + + } else { + av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row, + mi_col); + } +#endif // filter horizontal edges if (mi_col - MAX_MIB_SIZE >= 0) { - av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, + av1_setup_dst_planes(pd, cm->seq_params->sb_size, frame_buffer, mi_row, mi_col - MAX_MIB_SIZE, plane, plane + 1); +#if CONFIG_AV1_HIGHBITDEPTH + (void)is_realtime; av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row, mi_col - MAX_MIB_SIZE); +#else + if (is_realtime && !plane) { + av1_filter_block_plane_horz_rt(cm, xd, plane, &pd[plane], mi_row, + mi_col - 
MAX_MIB_SIZE); + } else { + av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row, + mi_col - MAX_MIB_SIZE); + } +#endif } } // filter horizontal edges - av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, + av1_setup_dst_planes(pd, cm->seq_params->sb_size, frame_buffer, mi_row, mi_col - MAX_MIB_SIZE, plane, plane + 1); +#if CONFIG_AV1_HIGHBITDEPTH + (void)is_realtime; av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row, mi_col - MAX_MIB_SIZE); +#else + if (is_realtime && !plane) { + av1_filter_block_plane_horz_rt(cm, xd, plane, &pd[plane], mi_row, + mi_col - MAX_MIB_SIZE); + + } else { + av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row, + mi_col - MAX_MIB_SIZE); + } +#endif } } else { // filter all vertical edges in every 128x128 super block for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) { - av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, - mi_col, plane, plane + 1); + av1_setup_dst_planes(pd, cm->seq_params->sb_size, frame_buffer, + mi_row, mi_col, plane, plane + 1); +#if CONFIG_AV1_HIGHBITDEPTH + (void)is_realtime; av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row, mi_col); +#else + if (is_realtime && !plane) { + av1_filter_block_plane_vert_rt(cm, xd, plane, &pd[plane], mi_row, + mi_col); + } else { + av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row, + mi_col); + } +#endif } } // filter all horizontal edges in every 128x128 super block for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) { - av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, - mi_col, plane, plane + 1); + av1_setup_dst_planes(pd, cm->seq_params->sb_size, frame_buffer, + mi_row, mi_col, plane, plane + 1); +#if CONFIG_AV1_HIGHBITDEPTH + (void)is_realtime; av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row, mi_col); 
+#else + if (is_realtime && !plane) { + av1_filter_block_plane_horz_rt(cm, xd, plane, &pd[plane], mi_row, + mi_col); + + } else { + av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row, + mi_col); + } +#endif } } } @@ -770,7 +995,8 @@ void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, #if CONFIG_LPF_MASK int is_decoding, #endif - int plane_start, int plane_end, int partial_frame) { + int plane_start, int plane_end, int partial_frame, + int is_realtime) { int start_mi_row, end_mi_row, mi_rows_to_filter; start_mi_row = 0; @@ -786,5 +1012,5 @@ void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, #if CONFIG_LPF_MASK is_decoding, #endif - plane_start, plane_end); + plane_start, plane_end, is_realtime); } diff --git a/third_party/libaom/source/libaom/av1/common/av1_loopfilter.h b/third_party/libaom/source/libaom/av1/common/av1_loopfilter.h index ca16bbe614..ed4453b2a7 100644 --- a/third_party/libaom/source/libaom/av1/common/av1_loopfilter.h +++ b/third_party/libaom/source/libaom/av1/common/av1_loopfilter.h @@ -151,7 +151,7 @@ void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, #else void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, struct macroblockd *xd, int plane_start, - int plane_end, int partial_frame); + int plane_end, int partial_frame, int is_realtime); #endif void av1_filter_block_plane_vert(const struct AV1Common *const cm, @@ -164,6 +164,20 @@ void av1_filter_block_plane_horz(const struct AV1Common *const cm, const MACROBLOCKD_PLANE *const plane_ptr, const uint32_t mi_row, const uint32_t mi_col); +void av1_filter_block_plane_vert_rt(const struct AV1Common *const cm, + const MACROBLOCKD *const xd, + const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, + const uint32_t mi_col); + +void av1_filter_block_plane_horz_rt(const struct AV1Common *const cm, + const MACROBLOCKD *const xd, + const int plane, + const MACROBLOCKD_PLANE *const 
plane_ptr, + const uint32_t mi_row, + const uint32_t mi_col); + uint8_t av1_get_filter_level(const struct AV1Common *cm, const loop_filter_info_n *lfi_n, const int dir_idx, int plane, const MB_MODE_INFO *mbmi); diff --git a/third_party/libaom/source/libaom/av1/common/blockd.h b/third_party/libaom/source/libaom/av1/common/blockd.h index 1d1c381bca..5e535add2d 100644 --- a/third_party/libaom/source/libaom/av1/common/blockd.h +++ b/third_party/libaom/source/libaom/av1/common/blockd.h @@ -194,11 +194,6 @@ typedef struct RD_STATS { int zero_rate; #if CONFIG_RD_DEBUG int txb_coeff_cost[MAX_MB_PLANE]; - // TODO(jingning): Temporary solution to silence stack over-size warning - // in handle_inter_mode. This should be fixed after rate-distortion - // optimization refactoring. - int16_t txb_coeff_cost_map[MAX_MB_PLANE][TXB_COEFF_COST_MAP_SIZE] - [TXB_COEFF_COST_MAP_SIZE]; #endif // CONFIG_RD_DEBUG } RD_STATS; @@ -325,6 +320,9 @@ typedef struct MB_MODE_INFO { int8_t cdef_strength : 4; /**@}*/ + /*! \brief Skip CDEF for this superblock */ + uint8_t skip_cdef_curr_sb; + #if CONFIG_RD_DEBUG /*! \brief RD info used for debugging */ RD_STATS rd_stats; @@ -552,10 +550,6 @@ typedef struct cfl_ctx { // Whether the reconstructed luma pixels need to be stored int store_y; - -#if CONFIG_DEBUG - int rate; -#endif // CONFIG_DEBUG } CFL_CTX; typedef struct dist_wtd_comp_params { @@ -810,7 +804,7 @@ typedef struct macroblockd { FRAME_CONTEXT *tile_ctx; /*! - * Bit depth: copied from cm->seq_params.bit_depth for convenience. + * Bit depth: copied from cm->seq_params->bit_depth for convenience. */ int bd; @@ -893,7 +887,7 @@ typedef struct macroblockd { /*! * Mask for this block used for compound prediction. */ - DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]); + uint8_t *seg_mask; /*! * CFL (chroma from luma) related parameters. 
@@ -937,13 +931,42 @@ typedef struct macroblockd { /*!\cond */ static INLINE int is_cur_buf_hbd(const MACROBLOCKD *xd) { +#if CONFIG_AV1_HIGHBITDEPTH return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0; +#else + (void)xd; + return 0; +#endif } static INLINE uint8_t *get_buf_by_bd(const MACROBLOCKD *xd, uint8_t *buf16) { +#if CONFIG_AV1_HIGHBITDEPTH return (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? CONVERT_TO_BYTEPTR(buf16) : buf16; +#else + (void)xd; + return buf16; +#endif +} + +typedef struct BitDepthInfo { + int bit_depth; + /*! Is the image buffer high bit depth? + * Low bit depth buffer uses uint8_t. + * High bit depth buffer uses uint16_t. + * Equivalent to cm->seq_params->use_highbitdepth + */ + int use_highbitdepth_buf; +} BitDepthInfo; + +static INLINE BitDepthInfo get_bit_depth_info(const MACROBLOCKD *xd) { + BitDepthInfo bit_depth_info; + bit_depth_info.bit_depth = xd->bd; + bit_depth_info.use_highbitdepth_buf = is_cur_buf_hbd(xd); + assert(IMPLIES(!bit_depth_info.use_highbitdepth_buf, + bit_depth_info.bit_depth == 8)); + return bit_depth_info; } static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) { diff --git a/third_party/libaom/source/libaom/av1/common/cdef.c b/third_party/libaom/source/libaom/av1/common/cdef.c index d9b5a104e4..9ab7d4d235 100644 --- a/third_party/libaom/source/libaom/av1/common/cdef.c +++ b/third_party/libaom/source/libaom/av1/common/cdef.c @@ -21,35 +21,6 @@ #include "av1/common/cdef_block.h" #include "av1/common/reconinter.h" -enum { TOP, LEFT, BOTTOM, RIGHT, BOUNDARIES } UENUM1BYTE(BOUNDARY); - -/*!\brief Parameters related to CDEF Block */ -typedef struct { - uint16_t *src; - uint8_t *dst; - uint16_t *colbuf[MAX_MB_PLANE]; - cdef_list dlist[MI_SIZE_64X64 * MI_SIZE_64X64]; - - int xdec; - int ydec; - int mi_wide_l2; - int mi_high_l2; - int frame_boundary[BOUNDARIES]; - - int damping; - int coeff_shift; - int level; - int sec_strength; - int cdef_count; - int is_zero_level; - int dir[CDEF_NBLOCKS][CDEF_NBLOCKS]; - int 
var[CDEF_NBLOCKS][CDEF_NBLOCKS]; - - int dst_stride; - int coffset; - int roffset; -} CdefBlockInfo; - static int is_8x8_block_skip(MB_MODE_INFO **grid, int mi_row, int mi_col, int mi_stride) { MB_MODE_INFO **mbmi = grid + mi_row * mi_stride + mi_col; @@ -116,10 +87,10 @@ void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, } } -static void copy_sb8_16(AV1_COMMON *cm, uint16_t *dst, int dstride, - const uint8_t *src, int src_voffset, int src_hoffset, - int sstride, int vsize, int hsize) { - if (cm->seq_params.use_highbitdepth) { +void av1_cdef_copy_sb8_16(const AV1_COMMON *const cm, uint16_t *const dst, + int dstride, const uint8_t *src, int src_voffset, + int src_hoffset, int sstride, int vsize, int hsize) { + if (cm->seq_params->use_highbitdepth) { const uint16_t *base = &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset]; cdef_copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize); @@ -151,29 +122,35 @@ static INLINE void copy_rect(uint16_t *dst, int dstride, const uint16_t *src, // Inputs: // cm: Pointer to common structure. // fb_info: Pointer to the CDEF block-level parameter structure. -// linebuf: Top feedback buffer for CDEF. +// colbuf: Left column buffer for CDEF. // cdef_left: Left block is filtered or not. // fbc, fbr: col and row index of a block. // plane: plane index Y/CB/CR. -// prev_row_cdef: Top blocks are filtered or not. // Returns: // Nothing will be returned. 
-static void cdef_prepare_fb(AV1_COMMON *cm, CdefBlockInfo *fb_info, - uint16_t **linebuf, const int *cdef_left, int fbc, - int fbr, uint8_t plane, - unsigned char *prev_row_cdef) { +static void cdef_prepare_fb(const AV1_COMMON *const cm, CdefBlockInfo *fb_info, + uint16_t **const colbuf, const int *cdef_left, + int fbc, int fbr, int plane) { const CommonModeInfoParams *const mi_params = &cm->mi_params; uint16_t *src = fb_info->src; - const int stride = (mi_params->mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER; + const int luma_stride = + ALIGN_POWER_OF_TWO(mi_params->mi_cols << MI_SIZE_LOG2, 4); const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; int cstart = 0; if (!*cdef_left) cstart = -CDEF_HBORDER; int rend, cend; - int nhb = AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc); - int nvb = AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr); - int hsize = nhb << fb_info->mi_wide_l2; - int vsize = nvb << fb_info->mi_high_l2; + const int nhb = + AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc); + const int nvb = + AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr); + const int hsize = nhb << fb_info->mi_wide_l2; + const int vsize = nvb << fb_info->mi_high_l2; + const uint16_t *top_linebuf = fb_info->top_linebuf[plane]; + const uint16_t *bot_linebuf = fb_info->bot_linebuf[plane]; + const int bot_offset = (vsize + CDEF_VBORDER) * CDEF_BSTRIDE; + const int stride = + luma_stride >> (plane == AOM_PLANE_Y ? 0 : cm->seq_params->subsampling_x); if (fbc == nhfb - 1) cend = hsize; @@ -185,54 +162,55 @@ static void cdef_prepare_fb(AV1_COMMON *cm, CdefBlockInfo *fb_info, else rend = vsize + CDEF_VBORDER; - if (fbc == nhfb - 1) { - /* On the last superblock column, fill in the right border with - CDEF_VERY_LARGE to avoid filtering with the outside. 
*/ - fill_rect(&src[cend + CDEF_HBORDER], CDEF_BSTRIDE, rend + CDEF_VBORDER, - hsize + CDEF_HBORDER - cend, CDEF_VERY_LARGE); - } - if (fbr == nvfb - 1) { - /* On the last superblock row, fill in the bottom border with - CDEF_VERY_LARGE to avoid filtering with the outside. */ - fill_rect(&src[(rend + CDEF_VBORDER) * CDEF_BSTRIDE], CDEF_BSTRIDE, - CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, CDEF_VERY_LARGE); - } /* Copy in the pixels we need from the current superblock for deringing.*/ - copy_sb8_16(cm, &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart], - CDEF_BSTRIDE, fb_info->dst, fb_info->roffset, - fb_info->coffset + cstart, fb_info->dst_stride, rend, - cend - cstart); - if (!prev_row_cdef[fbc]) { - copy_sb8_16(cm, &src[CDEF_HBORDER], CDEF_BSTRIDE, fb_info->dst, - fb_info->roffset - CDEF_VBORDER, fb_info->coffset, - fb_info->dst_stride, CDEF_VBORDER, hsize); - } else if (fbr > 0) { - copy_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, - &linebuf[plane][fb_info->coffset], stride, CDEF_VBORDER, hsize); + av1_cdef_copy_sb8_16( + cm, &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart], + CDEF_BSTRIDE, fb_info->dst, fb_info->roffset, fb_info->coffset + cstart, + fb_info->dst_stride, vsize, cend - cstart); + + /* Copy in the pixels we need for the current superblock from bottom buffer.*/ + if (fbr < nvfb - 1) { + copy_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE, + &bot_linebuf[fb_info->coffset], stride, CDEF_VBORDER, hsize); + } else { + fill_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, + hsize, CDEF_VERY_LARGE); + } + if (fbr < nvfb - 1 && fbc > 0) { + copy_rect(&src[bot_offset], CDEF_BSTRIDE, + &bot_linebuf[fb_info->coffset - CDEF_HBORDER], stride, + CDEF_VBORDER, CDEF_HBORDER); + } else { + fill_rect(&src[bot_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, + CDEF_VERY_LARGE); + } + if (fbr < nvfb - 1 && fbc < nhfb - 1) { + copy_rect(&src[bot_offset + hsize + CDEF_HBORDER], CDEF_BSTRIDE, + &bot_linebuf[fb_info->coffset + hsize], 
stride, CDEF_VBORDER, + CDEF_HBORDER); + } else { + fill_rect(&src[bot_offset + hsize + CDEF_HBORDER], CDEF_BSTRIDE, + CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); + } + + /* Copy in the pixels we need from the current superblock from top buffer.*/ + if (fbr > 0) { + copy_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, &top_linebuf[fb_info->coffset], + stride, CDEF_VBORDER, hsize); } else { fill_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hsize, CDEF_VERY_LARGE); } - if (!prev_row_cdef[fbc - 1]) { - copy_sb8_16(cm, src, CDEF_BSTRIDE, fb_info->dst, - fb_info->roffset - CDEF_VBORDER, - fb_info->coffset - CDEF_HBORDER, fb_info->dst_stride, - CDEF_VBORDER, CDEF_HBORDER); - } else if (fbr > 0 && fbc > 0) { - copy_rect(src, CDEF_BSTRIDE, - &linebuf[plane][fb_info->coffset - CDEF_HBORDER], stride, - CDEF_VBORDER, CDEF_HBORDER); + if (fbr > 0 && fbc > 0) { + copy_rect(src, CDEF_BSTRIDE, &top_linebuf[fb_info->coffset - CDEF_HBORDER], + stride, CDEF_VBORDER, CDEF_HBORDER); } else { fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); } - if (!prev_row_cdef[fbc + 1]) { - copy_sb8_16(cm, &src[CDEF_HBORDER + hsize], CDEF_BSTRIDE, fb_info->dst, - fb_info->roffset - CDEF_VBORDER, fb_info->coffset + hsize, - fb_info->dst_stride, CDEF_VBORDER, CDEF_HBORDER); - } else if (fbr > 0 && fbc < nhfb - 1) { + if (fbr > 0 && fbc < nhfb - 1) { copy_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, - &linebuf[plane][fb_info->coffset + hsize], stride, CDEF_VBORDER, + &top_linebuf[fb_info->coffset + hsize], stride, CDEF_VBORDER, CDEF_HBORDER); } else { fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, @@ -241,36 +219,25 @@ static void cdef_prepare_fb(AV1_COMMON *cm, CdefBlockInfo *fb_info, if (*cdef_left) { /* If we deringed the superblock on the left then we need to copy in saved pixels. 
*/ - copy_rect(src, CDEF_BSTRIDE, fb_info->colbuf[plane], CDEF_HBORDER, + copy_rect(src, CDEF_BSTRIDE, colbuf[plane], CDEF_HBORDER, rend + CDEF_VBORDER, CDEF_HBORDER); } /* Saving pixels in case we need to dering the superblock on the right. */ - copy_rect(fb_info->colbuf[plane], CDEF_HBORDER, src + hsize, CDEF_BSTRIDE, + copy_rect(colbuf[plane], CDEF_HBORDER, src + hsize, CDEF_BSTRIDE, rend + CDEF_VBORDER, CDEF_HBORDER); - copy_sb8_16(cm, &linebuf[plane][fb_info->coffset], stride, fb_info->dst, - (MI_SIZE_64X64 << fb_info->mi_high_l2) * (fbr + 1) - CDEF_VBORDER, - fb_info->coffset, fb_info->dst_stride, CDEF_VBORDER, hsize); - if (fb_info->frame_boundary[TOP]) { - fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, - CDEF_VERY_LARGE); - } if (fb_info->frame_boundary[LEFT]) { fill_rect(src, CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); } - if (fb_info->frame_boundary[BOTTOM]) { - fill_rect(&src[(vsize + CDEF_VBORDER) * CDEF_BSTRIDE], CDEF_BSTRIDE, - CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, CDEF_VERY_LARGE); - } if (fb_info->frame_boundary[RIGHT]) { fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); } } -static INLINE void cdef_filter_fb(CdefBlockInfo *fb_info, uint8_t plane, +static INLINE void cdef_filter_fb(CdefBlockInfo *const fb_info, int plane, uint8_t use_highbitdepth) { int offset = fb_info->dst_stride * fb_info->roffset + fb_info->coffset; if (use_highbitdepth) { @@ -291,11 +258,11 @@ static INLINE void cdef_filter_fb(CdefBlockInfo *fb_info, uint8_t plane, } // Initializes block-level parameters for CDEF. 
-static INLINE void cdef_init_fb_col(MACROBLOCKD *xd, +static INLINE void cdef_init_fb_col(const MACROBLOCKD *const xd, const CdefInfo *const cdef_info, - CdefBlockInfo *fb_info, - const int mbmi_cdef_strength, int fbc, - int fbr, uint8_t plane) { + CdefBlockInfo *const fb_info, + int mbmi_cdef_strength, int fbc, int fbr, + int plane) { if (plane == AOM_PLANE_Y) { fb_info->level = cdef_info->cdef_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS; @@ -328,9 +295,9 @@ static INLINE void cdef_init_fb_col(MACROBLOCKD *xd, fb_info->coffset = MI_SIZE_64X64 * fbc << fb_info->mi_wide_l2; } -static bool cdef_fb_col(AV1_COMMON *cm, MACROBLOCKD *xd, CdefBlockInfo *fb_info, - int fbc, int fbr, int *cdef_left, uint16_t **linebuf, - unsigned char *prev_row_cdef) { +static void cdef_fb_col(const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + CdefBlockInfo *const fb_info, uint16_t **const colbuf, + int *cdef_left, int fbc, int fbr) { const CommonModeInfoParams *const mi_params = &cm->mi_params; const int mbmi_cdef_strength = mi_params @@ -343,9 +310,9 @@ static bool cdef_fb_col(AV1_COMMON *cm, MACROBLOCKD *xd, CdefBlockInfo *fb_info, MI_SIZE_64X64 * fbc] == NULL || mbmi_cdef_strength == -1) { *cdef_left = 0; - return 0; + return; } - for (uint8_t plane = 0; plane < num_planes; plane++) { + for (int plane = 0; plane < num_planes; plane++) { cdef_init_fb_col(xd, &cm->cdef_info, fb_info, mbmi_cdef_strength, fbc, fbr, plane); if (fb_info->is_zero_level || @@ -353,20 +320,26 @@ static bool cdef_fb_col(AV1_COMMON *cm, MACROBLOCKD *xd, CdefBlockInfo *fb_info, mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, fb_info->dlist, BLOCK_64X64)) == 0) { *cdef_left = 0; - return 0; + return; } - cdef_prepare_fb(cm, fb_info, linebuf, cdef_left, fbc, fbr, plane, - prev_row_cdef); - cdef_filter_fb(fb_info, plane, cm->seq_params.use_highbitdepth); + cdef_prepare_fb(cm, fb_info, colbuf, cdef_left, fbc, fbr, plane); + cdef_filter_fb(fb_info, plane, cm->seq_params->use_highbitdepth); } 
*cdef_left = 1; - return 1; } -static INLINE void cdef_init_fb_row(CdefBlockInfo *fb_info, int mi_rows, - int fbr) { - const int nvfb = (mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; - +// Initializes row-level parameters for CDEF frame. +void av1_cdef_init_fb_row(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + CdefBlockInfo *const fb_info, + uint16_t **const linebuf, uint16_t *const src, + struct AV1CdefSyncData *const cdef_sync, int fbr) { + (void)cdef_sync; + const int num_planes = av1_num_planes(cm); + const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + const int luma_stride = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4); + const bool ping_pong = fbr & 1; // for the current filter block, it's top left corner mi structure (mi_tl) // is first accessed to check whether the top and left boundaries are // frame boundaries. Then bottom-left and top-right mi structures are @@ -379,78 +352,58 @@ static INLINE void cdef_init_fb_row(CdefBlockInfo *fb_info, int mi_rows, fb_info->frame_boundary[TOP] = (MI_SIZE_64X64 * fbr == 0) ? 1 : 0; if (fbr != nvfb - 1) fb_info->frame_boundary[BOTTOM] = - (MI_SIZE_64X64 * (fbr + 1) == mi_rows) ? 1 : 0; + (MI_SIZE_64X64 * (fbr + 1) == cm->mi_params.mi_rows) ? 1 : 0; else fb_info->frame_boundary[BOTTOM] = 1; + + fb_info->src = src; + fb_info->damping = cm->cdef_info.cdef_damping; + fb_info->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0); + av1_zero(fb_info->dir); + av1_zero(fb_info->var); + + for (int plane = 0; plane < num_planes; plane++) { + const int mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y; + const int offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2; + const int stride = luma_stride >> xd->plane[plane].subsampling_x; + // here ping-pong buffers are maintained for top linebuf + // to avoid linebuf over-write by consecutive row. 
+ uint16_t *const top_linebuf = + &linebuf[plane][ping_pong * CDEF_VBORDER * stride]; + fb_info->bot_linebuf[plane] = &linebuf[plane][(CDEF_VBORDER << 1) * stride]; + + if (fbr != nvfb - 1) // top line buffer copy + av1_cdef_copy_sb8_16(cm, top_linebuf, stride, xd->plane[plane].dst.buf, + offset - CDEF_VBORDER, 0, + xd->plane[plane].dst.stride, CDEF_VBORDER, stride); + fb_info->top_linebuf[plane] = + &linebuf[plane][(!ping_pong) * CDEF_VBORDER * stride]; + + if (fbr != nvfb - 1) // bottom line buffer copy + av1_cdef_copy_sb8_16(cm, fb_info->bot_linebuf[plane], stride, + xd->plane[plane].dst.buf, offset, 0, + xd->plane[plane].dst.stride, CDEF_VBORDER, stride); + } } -static void cdef_fb_row(AV1_COMMON *cm, MACROBLOCKD *xd, CdefBlockInfo *fb_info, - uint16_t **linebuf, int fbr, - unsigned char *curr_row_cdef, - unsigned char *prev_row_cdef) { +void av1_cdef_fb_row(const AV1_COMMON *const cm, MACROBLOCKD *xd, + uint16_t **const linebuf, uint16_t **const colbuf, + uint16_t *const src, int fbr, + cdef_init_fb_row_t cdef_init_fb_row_fn, + struct AV1CdefSyncData *const cdef_sync) { + CdefBlockInfo fb_info; int cdef_left = 1; const int nhfb = (cm->mi_params.mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; - cdef_init_fb_row(fb_info, cm->mi_params.mi_rows, fbr); + cdef_init_fb_row_fn(cm, xd, &fb_info, linebuf, src, cdef_sync, fbr); for (int fbc = 0; fbc < nhfb; fbc++) { - fb_info->frame_boundary[LEFT] = (MI_SIZE_64X64 * fbc == 0) ? 1 : 0; + fb_info.frame_boundary[LEFT] = (MI_SIZE_64X64 * fbc == 0) ? 1 : 0; if (fbc != nhfb - 1) - fb_info->frame_boundary[RIGHT] = + fb_info.frame_boundary[RIGHT] = (MI_SIZE_64X64 * (fbc + 1) == cm->mi_params.mi_cols) ? 1 : 0; else - fb_info->frame_boundary[RIGHT] = 1; - curr_row_cdef[fbc] = cdef_fb_col(cm, xd, fb_info, fbc, fbr, &cdef_left, - linebuf, prev_row_cdef); - } -} - -// Initialize the frame-level CDEF parameters. -// Inputs: -// frame: Pointer to input frame buffer. -// cm: Pointer to common structure. 
-// xd: Pointer to common current coding block structure. -// fb_info: Pointer to the CDEF block-level parameter structure. -// src: Intermediate input buffer for CDEF. -// colbuf: Left feedback buffer for CDEF. -// linebuf: Top feedback buffer for CDEF. -// Returns: -// Nothing will be returned. -static void cdef_prepare_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, - MACROBLOCKD *xd, CdefBlockInfo *fb_info, - uint16_t *src, uint16_t **colbuf, - uint16_t **linebuf) { - const int num_planes = av1_num_planes(cm); - const int stride = (cm->mi_params.mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER; - av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0, - num_planes); - - for (uint8_t plane = 0; plane < num_planes; plane++) { - linebuf[plane] = aom_malloc(sizeof(*linebuf) * CDEF_VBORDER * stride); - const int mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y; - const int block_height = (MI_SIZE_64X64 << mi_high_l2) + 2 * CDEF_VBORDER; - colbuf[plane] = aom_malloc( - sizeof(*colbuf) * - ((CDEF_BLOCKSIZE << (MI_SIZE_LOG2 - xd->plane[plane].subsampling_y)) + - 2 * CDEF_VBORDER) * - CDEF_HBORDER); - fill_rect(colbuf[plane], CDEF_HBORDER, block_height, CDEF_HBORDER, - CDEF_VERY_LARGE); - fb_info->colbuf[plane] = colbuf[plane]; - } - - fb_info->src = src; - fb_info->damping = cm->cdef_info.cdef_damping; - fb_info->coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0); - memset(fb_info->dir, 0, sizeof(fb_info->dir)); - memset(fb_info->var, 0, sizeof(fb_info->var)); -} - -static void cdef_free(unsigned char *row_cdef, uint16_t **colbuf, - uint16_t **linebuf, const int num_planes) { - aom_free(row_cdef); - for (uint8_t plane = 0; plane < num_planes; plane++) { - aom_free(colbuf[plane]); - aom_free(linebuf[plane]); + fb_info.frame_boundary[RIGHT] = 1; + cdef_fb_col(cm, xd, &fb_info, colbuf, &cdef_left, fbc, fbr); } } @@ -461,29 +414,15 @@ static void cdef_free(unsigned char *row_cdef, uint16_t **colbuf, // xd: Pointer to common current coding block 
structure. // Returns: // Nothing will be returned. -void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, - MACROBLOCKD *xd) { - DECLARE_ALIGNED(16, uint16_t, src[CDEF_INBUF_SIZE]); - uint16_t *colbuf[MAX_MB_PLANE] = { NULL }; - uint16_t *linebuf[MAX_MB_PLANE] = { NULL }; - CdefBlockInfo fb_info; - unsigned char *row_cdef, *prev_row_cdef, *curr_row_cdef; +void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *const cm, + MACROBLOCKD *xd, cdef_init_fb_row_t cdef_init_fb_row_fn) { const int num_planes = av1_num_planes(cm); const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; - const int nhfb = (cm->mi_params.mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; - row_cdef = aom_malloc(sizeof(*row_cdef) * (nhfb + 2) * 2); - memset(row_cdef, 1, sizeof(*row_cdef) * (nhfb + 2) * 2); - prev_row_cdef = row_cdef + 1; - curr_row_cdef = prev_row_cdef + nhfb + 2; - cdef_prepare_frame(frame, cm, xd, &fb_info, src, colbuf, linebuf); - - for (int fbr = 0; fbr < nvfb; fbr++) { - unsigned char *tmp; - cdef_fb_row(cm, xd, &fb_info, linebuf, fbr, curr_row_cdef, prev_row_cdef); - tmp = prev_row_cdef; - prev_row_cdef = curr_row_cdef; - curr_row_cdef = tmp; - } - cdef_free(row_cdef, colbuf, linebuf, num_planes); + av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0, + num_planes); + + for (int fbr = 0; fbr < nvfb; fbr++) + av1_cdef_fb_row(cm, xd, cm->cdef_info.linebuf, cm->cdef_info.colbuf, + cm->cdef_info.srcbuf, fbr, cdef_init_fb_row_fn, NULL); } diff --git a/third_party/libaom/source/libaom/av1/common/cdef.h b/third_party/libaom/source/libaom/av1/common/cdef.h index 4d6e60023b..194117884e 100644 --- a/third_party/libaom/source/libaom/av1/common/cdef.h +++ b/third_party/libaom/source/libaom/av1/common/cdef.h @@ -23,6 +23,40 @@ #include "av1/common/av1_common_int.h" #include "av1/common/cdef_block.h" +enum { TOP, LEFT, BOTTOM, RIGHT, BOUNDARIES } UENUM1BYTE(BOUNDARY); + +struct AV1CdefSyncData; + +/*!\brief Parameters related to CDEF 
Block */ +typedef struct { + uint16_t *src; /*!< CDEF intermediate buffer */ + uint16_t *top_linebuf[MAX_MB_PLANE]; /*!< CDEF top line buffer */ + uint16_t *bot_linebuf[MAX_MB_PLANE]; /*!< CDEF bottom line buffer */ + uint8_t *dst; /*!< CDEF destination buffer */ + cdef_list + dlist[MI_SIZE_64X64 * MI_SIZE_64X64]; /*!< CDEF 8x8 block positions */ + + int xdec; /*!< Sub-sampling X */ + int ydec; /*!< Sub-sampling X */ + int mi_wide_l2; /*!< Pixels per mi unit in width */ + int mi_high_l2; /*!< Pixels per mi unit in height */ + int frame_boundary[BOUNDARIES]; /*!< frame boundaries */ + + int damping; /*!< CDEF damping factor */ + int coeff_shift; /*!< Bit-depth based shift for calculating filter strength */ + int level; /*!< CDEF filtering level */ + int sec_strength; /*!< CDEF secondary strength */ + int cdef_count; /*!< Number of CDEF sub-blocks in superblock */ + int is_zero_level; /*!< CDEF filtering level ON/OFF */ + int dir[CDEF_NBLOCKS] + [CDEF_NBLOCKS]; /*!< CDEF filter direction for all 8x8 sub-blocks*/ + int var[CDEF_NBLOCKS][CDEF_NBLOCKS]; /*!< variance for all 8x8 sub-blocks */ + + int dst_stride; /*!< CDEF destination buffer stride */ + int coffset; /*!< current superblock offset in a row */ + int roffset; /*!< current row offset */ +} CdefBlockInfo; + static INLINE int sign(int i) { return i < 0 ? -1 : 1; } static INLINE int constrain(int diff, int threshold, int damping) { @@ -41,19 +75,36 @@ int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params, int mi_row, int mi_col, cdef_list *dlist, BLOCK_SIZE bsize); +typedef void (*cdef_init_fb_row_t)( + const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + CdefBlockInfo *const fb_info, uint16_t **const linebuf, uint16_t *const src, + struct AV1CdefSyncData *const cdef_sync, int fbr); + /*!\brief Function for applying CDEF to a frame * * \ingroup in_loop_cdef * This function applies CDEF to a frame. 
* - * \param[in, out] frame Compressed frame buffer - * \param[in, out] cm Pointer to top level common structure - * \param[in] xd Pointer to common current coding block structure + * \param[in, out] frame Compressed frame buffer + * \param[in, out] cm Pointer to top level common structure + * \param[in] xd Pointer to common current coding block structure + * \param[in] cdef_init_fb_row_fn Function Pointer * * \return Nothing is returned. Instead, the filtered frame is output in * \c frame. */ -void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd); +void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *const cm, + MACROBLOCKD *xd, cdef_init_fb_row_t cdef_init_fb_row_fn); +void av1_cdef_fb_row(const AV1_COMMON *const cm, MACROBLOCKD *xd, + uint16_t **const linebuf, uint16_t **const colbuf, + uint16_t *const src, int fbr, + cdef_init_fb_row_t cdef_init_fb_row_fn, + struct AV1CdefSyncData *const cdef_sync); +void av1_cdef_init_fb_row(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + CdefBlockInfo *const fb_info, + uint16_t **const linebuf, uint16_t *const src, + struct AV1CdefSyncData *const cdef_sync, int fbr); #ifdef __cplusplus } // extern "C" diff --git a/third_party/libaom/source/libaom/av1/common/cdef_block.h b/third_party/libaom/source/libaom/av1/common/cdef_block.h index 6b0ae0a9db..574df2d0de 100644 --- a/third_party/libaom/source/libaom/av1/common/cdef_block.h +++ b/third_party/libaom/source/libaom/av1/common/cdef_block.h @@ -19,8 +19,8 @@ #define CDEF_NBLOCKS ((1 << MAX_SB_SIZE_LOG2) / 8) #define CDEF_SB_SHIFT (MAX_SB_SIZE_LOG2 - CDEF_BLOCKSIZE_LOG2) -/* We need to buffer three vertical lines. */ -#define CDEF_VBORDER (3) +/* We need to buffer two vertical lines. */ +#define CDEF_VBORDER (2) /* We only need to buffer three horizontal pixels too, but let's align to 16 bytes (8 x 16 bits) to make vectorization easier. 
*/ #define CDEF_HBORDER (8) diff --git a/third_party/libaom/source/libaom/av1/common/cfl.h b/third_party/libaom/source/libaom/av1/common/cfl.h index 0062e9f7ba..0d53764f28 100644 --- a/third_party/libaom/source/libaom/av1/common/cfl.h +++ b/third_party/libaom/source/libaom/av1/common/cfl.h @@ -39,7 +39,7 @@ static INLINE CFL_ALLOWED_TYPE store_cfl_required(const AV1_COMMON *cm, const MACROBLOCKD *xd) { const MB_MODE_INFO *mbmi = xd->mi[0]; - if (cm->seq_params.monochrome) return CFL_DISALLOWED; + if (cm->seq_params->monochrome) return CFL_DISALLOWED; if (!xd->is_chroma_ref) { // For non-chroma-reference blocks, we should always store the luma pixels, diff --git a/third_party/libaom/source/libaom/av1/common/common.h b/third_party/libaom/source/libaom/av1/common/common.h index bed6083db2..cc2da98a16 100644 --- a/third_party/libaom/source/libaom/av1/common/common.h +++ b/third_party/libaom/source/libaom/av1/common/common.h @@ -50,7 +50,7 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { } #define CHECK_MEM_ERROR(cm, lval, expr) \ - AOM_CHECK_MEM_ERROR(&cm->error, lval, expr) + AOM_CHECK_MEM_ERROR(cm->error, lval, expr) #define AOM_FRAME_MARKER 0x2 diff --git a/third_party/libaom/source/libaom/av1/common/common_data.h b/third_party/libaom/source/libaom/av1/common/common_data.h index 402845cafe..38e14714c0 100644 --- a/third_party/libaom/source/libaom/av1/common/common_data.h +++ b/third_party/libaom/source/libaom/av1/common/common_data.h @@ -434,9 +434,12 @@ static const int intra_mode_context[INTRA_MODES] = { static const int quant_dist_weight[4][2] = { { 2, 3 }, { 2, 5 }, { 2, 7 }, { 1, MAX_FRAME_DISTANCE } }; -static const int quant_dist_lookup_table[2][4][2] = { - { { 9, 7 }, { 11, 5 }, { 12, 4 }, { 13, 3 } }, - { { 7, 9 }, { 5, 11 }, { 4, 12 }, { 3, 13 } }, + +static const int quant_dist_lookup_table[4][2] = { + { 9, 7 }, + { 11, 5 }, + { 12, 4 }, + { 13, 3 }, }; #ifdef __cplusplus diff --git a/third_party/libaom/source/libaom/av1/common/enums.h 
b/third_party/libaom/source/libaom/av1/common/enums.h index 9c2976b08d..0e1e744daf 100644 --- a/third_party/libaom/source/libaom/av1/common/enums.h +++ b/third_party/libaom/source/libaom/av1/common/enums.h @@ -321,6 +321,7 @@ enum { PLANE_TYPE_Y, PLANE_TYPE_UV, PLANE_TYPES } UENUM1BYTE(PLANE_TYPE); #define CFL_ALPHABET_SIZE_LOG2 4 #define CFL_ALPHABET_SIZE (1 << CFL_ALPHABET_SIZE_LOG2) #define CFL_MAGS_SIZE ((2 << CFL_ALPHABET_SIZE_LOG2) + 1) +#define CFL_INDEX_ZERO CFL_ALPHABET_SIZE #define CFL_IDX_U(idx) (idx >> CFL_ALPHABET_SIZE_LOG2) #define CFL_IDX_V(idx) (idx & (CFL_ALPHABET_SIZE - 1)) @@ -451,6 +452,14 @@ enum { UV_MODE_INVALID, // For uv_mode in inter blocks } UENUM1BYTE(UV_PREDICTION_MODE); +// Number of top model rd to store for pruning y modes in intra mode decision +#define TOP_INTRA_MODEL_COUNT 4 +// Total number of luma intra prediction modes (include both directional and +// non-directional modes) +// 61 = PAETH_PRED - DC_PRED + 1 + 6 * 8 +// Because there are 8 directional modes, each has additional 6 delta angles. 
+#define LUMA_MODE_COUNT 61 + enum { SIMPLE_TRANSLATION, OBMC_CAUSAL, // 2-sided OBMC diff --git a/third_party/libaom/source/libaom/av1/common/loopfiltermask.c b/third_party/libaom/source/libaom/av1/common/loopfiltermask.c index 1ae0b112ce..22ab0adf2c 100644 --- a/third_party/libaom/source/libaom/av1/common/loopfiltermask.c +++ b/third_party/libaom/source/libaom/av1/common/loopfiltermask.c @@ -1002,11 +1002,11 @@ void av1_filter_block_plane_bitmask_vert( } #if CONFIG_AV1_HIGHBITDEPTH - if (cm->seq_params.use_highbitdepth) + if (cm->seq_params->use_highbitdepth) highbd_filter_selectively_vert_row2( ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1, - &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth); + &cm->lf_info, lfl, lfl2, (int)cm->seq_params->bit_depth); else filter_selectively_vert_row2( ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, @@ -1075,10 +1075,11 @@ void av1_filter_block_plane_bitmask_horz( mask_4x4 = (mask_4x4 >> shift) & mask_cutoff; #if CONFIG_AV1_HIGHBITDEPTH - if (cm->seq_params.use_highbitdepth) - highbd_filter_selectively_horiz( - CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16, - mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->seq_params.bit_depth); + if (cm->seq_params->use_highbitdepth) + highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf), + dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl, + (int)cm->seq_params->bit_depth); else filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, mask_8x8, mask_4x4, &cm->lf_info, lfl); @@ -1109,10 +1110,10 @@ void av1_filter_block_plane_ver(AV1_COMMON *const cm, uint8_t *lfl2; // filter two rows at a time - for (r = 0; r < cm->seq_params.mib_size && + for (r = 0; r < cm->seq_params->mib_size && ((mi_row + r) << MI_SIZE_LOG2 < cm->height); r += r_step) { - for (c = 0; c < cm->seq_params.mib_size && + for (c = 0; c < 
cm->seq_params->mib_size && ((mi_col + c) << MI_SIZE_LOG2 < cm->width); c += MI_SIZE_64X64) { dst->buf += ((c << MI_SIZE_LOG2) >> ssx); @@ -1159,11 +1160,11 @@ void av1_filter_block_plane_ver(AV1_COMMON *const cm, uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff; #if CONFIG_AV1_HIGHBITDEPTH - if (cm->seq_params.use_highbitdepth) + if (cm->seq_params->use_highbitdepth) highbd_filter_selectively_vert_row2( ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1, - &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth); + &cm->lf_info, lfl, lfl2, (int)cm->seq_params->bit_depth); else filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, @@ -1194,10 +1195,10 @@ void av1_filter_block_plane_hor(AV1_COMMON *const cm, uint64_t mask_4x4 = 0; uint8_t *lfl; - for (r = 0; r < cm->seq_params.mib_size && + for (r = 0; r < cm->seq_params->mib_size && ((mi_row + r) << MI_SIZE_LOG2 < cm->height); r += r_step) { - for (c = 0; c < cm->seq_params.mib_size && + for (c = 0; c < cm->seq_params->mib_size && ((mi_col + c) << MI_SIZE_LOG2 < cm->width); c += MI_SIZE_64X64) { if (mi_row + r == 0) continue; @@ -1235,11 +1236,11 @@ void av1_filter_block_plane_hor(AV1_COMMON *const cm, mask_4x4 = (mask_4x4 >> shift) & mask_cutoff; #if CONFIG_AV1_HIGHBITDEPTH - if (cm->seq_params.use_highbitdepth) + if (cm->seq_params->use_highbitdepth) highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16, mask_8x8, mask_4x4, &cm->lf_info, lfl, - (int)cm->seq_params.bit_depth); + (int)cm->seq_params->bit_depth); else filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, mask_8x8, mask_4x4, &cm->lf_info, lfl); @@ -1260,9 +1261,11 @@ void av1_store_bitmask_vartx(AV1_COMMON *cm, int mi_row, int mi_col, const TX_SIZE tx_size_y_vert = txsize_vert_map[tx_size]; const TX_SIZE tx_size_y_horz = txsize_horz_map[tx_size]; const TX_SIZE 
tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize( - mbmi->bsize, cm->seq_params.subsampling_x, cm->seq_params.subsampling_y)]; + mbmi->bsize, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y)]; const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize( - mbmi->bsize, cm->seq_params.subsampling_x, cm->seq_params.subsampling_y)]; + mbmi->bsize, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y)]; const int is_square_transform_size = tx_size <= TX_64X64; int mask_id = 0; int offset = 0; @@ -1330,9 +1333,11 @@ void av1_store_bitmask_univariant_tx(AV1_COMMON *cm, int mi_row, int mi_col, const TX_SIZE tx_size_y_vert = txsize_vert_map[mbmi->tx_size]; const TX_SIZE tx_size_y_horz = txsize_horz_map[mbmi->tx_size]; const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize( - mbmi->bsize, cm->seq_params.subsampling_x, cm->seq_params.subsampling_y)]; + mbmi->bsize, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y)]; const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize( - mbmi->bsize, cm->seq_params.subsampling_x, cm->seq_params.subsampling_y)]; + mbmi->bsize, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y)]; const int is_square_transform_size = mbmi->tx_size <= TX_64X64; int mask_id = 0; int offset = 0; diff --git a/third_party/libaom/source/libaom/av1/common/mv.h b/third_party/libaom/source/libaom/av1/common/mv.h index be539e8201..3203bf7278 100644 --- a/third_party/libaom/source/libaom/av1/common/mv.h +++ b/third_party/libaom/source/libaom/av1/common/mv.h @@ -12,6 +12,8 @@ #ifndef AOM_AV1_COMMON_MV_H_ #define AOM_AV1_COMMON_MV_H_ +#include <stdlib.h> + #include "av1/common/common.h" #include "av1/common/common_data.h" #include "aom_dsp/aom_filter.h" diff --git a/third_party/libaom/source/libaom/av1/common/mvref_common.c b/third_party/libaom/source/libaom/av1/common/mvref_common.c index 04e050a691..3431e7d6ad 100644 --- 
a/third_party/libaom/source/libaom/av1/common/mvref_common.c +++ b/third_party/libaom/source/libaom/av1/common/mvref_common.c @@ -258,7 +258,7 @@ static AOM_INLINE void scan_blk_mbmi( static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_row, int mi_col, int bs) { - const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size]; + const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size]; const int mask_row = mi_row & (sb_mi_size - 1); const int mask_col = mi_col & (sb_mi_size - 1); @@ -347,7 +347,7 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd, const int cur_frame_index = cm->cur_frame->order_hint; const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, rf[0]); const int frame0_index = buf_0->order_hint; - const int cur_offset_0 = get_relative_dist(&cm->seq_params.order_hint_info, + const int cur_offset_0 = get_relative_dist(&cm->seq_params->order_hint_info, cur_frame_index, frame0_index); int idx; const int allow_high_precision_mv = cm->features.allow_high_precision_mv; @@ -380,7 +380,7 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd, // Process compound inter mode const RefCntBuffer *const buf_1 = get_ref_frame_buf(cm, rf[1]); const int frame1_index = buf_1->order_hint; - const int cur_offset_1 = get_relative_dist(&cm->seq_params.order_hint_info, + const int cur_offset_1 = get_relative_dist(&cm->seq_params->order_hint_info, cur_frame_index, frame1_index); int_mv comp_refmv; get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, @@ -838,7 +838,9 @@ void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv, void av1_setup_frame_buf_refs(AV1_COMMON *cm) { cm->cur_frame->order_hint = cm->current_frame.order_hint; cm->cur_frame->display_order_hint = cm->current_frame.display_order_hint; - +#if CONFIG_FRAME_PARALLEL_ENCODE + cm->cur_frame->pyramid_level = cm->current_frame.pyramid_level; +#endif // CONFIG_FRAME_PARALLEL_ENCODE MV_REFERENCE_FRAME ref_frame; for 
(ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); @@ -854,10 +856,10 @@ void av1_setup_frame_sign_bias(AV1_COMMON *cm) { MV_REFERENCE_FRAME ref_frame; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); - if (cm->seq_params.order_hint_info.enable_order_hint && buf != NULL) { + if (cm->seq_params->order_hint_info.enable_order_hint && buf != NULL) { const int ref_order_hint = buf->order_hint; cm->ref_frame_sign_bias[ref_frame] = - (get_relative_dist(&cm->seq_params.order_hint_info, ref_order_hint, + (get_relative_dist(&cm->seq_params->order_hint_info, ref_order_hint, (int)cm->current_frame.order_hint) <= 0) ? 0 : 1; @@ -930,10 +932,10 @@ static int motion_field_projection(AV1_COMMON *cm, &start_frame_buf->ref_order_hints[0]; const int cur_order_hint = cm->cur_frame->order_hint; int start_to_current_frame_offset = get_relative_dist( - &cm->seq_params.order_hint_info, start_frame_order_hint, cur_order_hint); + &cm->seq_params->order_hint_info, start_frame_order_hint, cur_order_hint); for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) { - ref_offset[rf] = get_relative_dist(&cm->seq_params.order_hint_info, + ref_offset[rf] = get_relative_dist(&cm->seq_params->order_hint_info, start_frame_order_hint, ref_order_hints[rf - LAST_FRAME]); } @@ -981,7 +983,7 @@ static int motion_field_projection(AV1_COMMON *cm, } void av1_setup_motion_field(AV1_COMMON *cm) { - const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info; + const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info; memset(cm->ref_frame_side, 0, sizeof(cm->ref_frame_side)); if (!order_hint_info->enable_order_hint) return; @@ -1219,7 +1221,7 @@ uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts, } void av1_setup_skip_mode_allowed(AV1_COMMON *cm) { - const OrderHintInfo 
*const order_hint_info = &cm->seq_params.order_hint_info; + const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info; SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info; skip_mode_info->skip_mode_allowed = 0; @@ -1323,11 +1325,11 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx, int lst_frame_sort_idx = -1; int gld_frame_sort_idx = -1; - assert(cm->seq_params.order_hint_info.enable_order_hint); - assert(cm->seq_params.order_hint_info.order_hint_bits_minus_1 >= 0); + assert(cm->seq_params->order_hint_info.enable_order_hint); + assert(cm->seq_params->order_hint_info.order_hint_bits_minus_1 >= 0); const int cur_order_hint = (int)cm->current_frame.order_hint; const int cur_frame_sort_idx = - 1 << cm->seq_params.order_hint_info.order_hint_bits_minus_1; + 1 << cm->seq_params->order_hint_info.order_hint_bits_minus_1; REF_FRAME_INFO ref_frame_info[REF_FRAMES]; int ref_flag_list[INTER_REFS_PER_FRAME] = { 0, 0, 0, 0, 0, 0, 0 }; @@ -1349,7 +1351,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx, ref_frame_info[i].sort_idx = (offset == -1) ? -1 : cur_frame_sort_idx + - get_relative_dist(&cm->seq_params.order_hint_info, + get_relative_dist(&cm->seq_params->order_hint_info, offset, cur_order_hint); assert(ref_frame_info[i].sort_idx >= -1); @@ -1360,11 +1362,11 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx, // Confirm both LAST_FRAME and GOLDEN_FRAME are valid forward reference // frames. 
if (lst_frame_sort_idx == -1 || lst_frame_sort_idx >= cur_frame_sort_idx) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Inter frame requests a look-ahead frame as LAST"); } if (gld_frame_sort_idx == -1 || gld_frame_sort_idx >= cur_frame_sort_idx) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Inter frame requests a look-ahead frame as GOLDEN"); } diff --git a/third_party/libaom/source/libaom/av1/common/pred_common.h b/third_party/libaom/source/libaom/av1/common/pred_common.h index 12bcce84f2..3db9dd69ef 100644 --- a/third_party/libaom/source/libaom/av1/common/pred_common.h +++ b/third_party/libaom/source/libaom/av1/common/pred_common.h @@ -107,9 +107,9 @@ static INLINE int get_comp_index_context(const AV1_COMMON *cm, if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint; if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint; - int fwd = abs(get_relative_dist(&cm->seq_params.order_hint_info, + int fwd = abs(get_relative_dist(&cm->seq_params->order_hint_info, fwd_frame_index, cur_frame_index)); - int bck = abs(get_relative_dist(&cm->seq_params.order_hint_info, + int bck = abs(get_relative_dist(&cm->seq_params->order_hint_info, cur_frame_index, bck_frame_index)); const MB_MODE_INFO *const above_mi = xd->above_mbmi; diff --git a/third_party/libaom/source/libaom/av1/common/reconinter.c b/third_party/libaom/source/libaom/av1/common/reconinter.c index ad155b26ae..70f4c6d5ee 100644 --- a/third_party/libaom/source/libaom/av1/common/reconinter.c +++ b/third_party/libaom/source/libaom/av1/common/reconinter.c @@ -713,8 +713,8 @@ void av1_build_one_inter_predictor( } void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm, - const MB_MODE_INFO *mbmi, int order_idx, - int *fwd_offset, int *bck_offset, + const MB_MODE_INFO *mbmi, int *fwd_offset, + int *bck_offset, int *use_dist_wtd_comp_avg, int is_compound) { 
assert(fwd_offset != NULL && bck_offset != NULL); @@ -734,18 +734,18 @@ void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm, if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint; if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint; - int d0 = clamp(abs(get_relative_dist(&cm->seq_params.order_hint_info, + int d0 = clamp(abs(get_relative_dist(&cm->seq_params->order_hint_info, fwd_frame_index, cur_frame_index)), 0, MAX_FRAME_DISTANCE); - int d1 = clamp(abs(get_relative_dist(&cm->seq_params.order_hint_info, + int d1 = clamp(abs(get_relative_dist(&cm->seq_params->order_hint_info, cur_frame_index, bck_frame_index)), 0, MAX_FRAME_DISTANCE); const int order = d0 <= d1; if (d0 == 0 || d1 == 0) { - *fwd_offset = quant_dist_lookup_table[order_idx][3][order]; - *bck_offset = quant_dist_lookup_table[order_idx][3][1 - order]; + *fwd_offset = quant_dist_lookup_table[3][order]; + *bck_offset = quant_dist_lookup_table[3][1 - order]; return; } @@ -758,8 +758,8 @@ void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm, if ((d0 > d1 && d0_c0 < d1_c1) || (d0 <= d1 && d0_c0 > d1_c1)) break; } - *fwd_offset = quant_dist_lookup_table[order_idx][i][order]; - *bck_offset = quant_dist_lookup_table[order_idx][i][1 - order]; + *fwd_offset = quant_dist_lookup_table[i][order]; + *bck_offset = quant_dist_lookup_table[i][1 - order]; } // True if the following hold: @@ -911,7 +911,7 @@ static void build_inter_predictors_8x8_and_bigger( ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd); av1_dist_wtd_comp_weight_assign( - cm, mi, 0, &inter_pred_params.conv_params.fwd_offset, + cm, mi, &inter_pred_params.conv_params.fwd_offset, &inter_pred_params.conv_params.bck_offset, &inter_pred_params.conv_params.use_dist_wtd_comp_avg, is_compound); @@ -1189,7 +1189,6 @@ void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd, void av1_setup_obmc_dst_bufs(MACROBLOCKD *xd, uint8_t **dst_buf1, uint8_t **dst_buf2) { -#if CONFIG_AV1_HIGHBITDEPTH if 
(is_cur_buf_hbd(xd)) { int len = sizeof(uint16_t); dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]); @@ -1203,16 +1202,13 @@ void av1_setup_obmc_dst_bufs(MACROBLOCKD *xd, uint8_t **dst_buf1, dst_buf2[2] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len); } else { -#endif // CONFIG_AV1_HIGHBITDEPTH dst_buf1[0] = xd->tmp_obmc_bufs[0]; dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE; dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2; dst_buf2[0] = xd->tmp_obmc_bufs[1]; dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE; dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2; -#if CONFIG_AV1_HIGHBITDEPTH } -#endif // CONFIG_AV1_HIGHBITDEPTH } void av1_setup_build_prediction_by_above_pred( @@ -1363,10 +1359,12 @@ void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm, assert(xd->mi[0]->angle_delta[PLANE_TYPE_UV] == 0); assert(xd->mi[0]->filter_intra_mode_info.use_filter_intra == 0); assert(xd->mi[0]->use_intrabc == 0); + const SequenceHeader *seq_params = cm->seq_params; - av1_predict_intra_block(cm, xd, pd->width, pd->height, - max_txsize_rect_lookup[plane_bsize], mode, 0, 0, - FILTER_INTRA_MODES, ctx->plane[plane], + av1_predict_intra_block(xd, seq_params->sb_size, + seq_params->enable_intra_edge_filter, pd->width, + pd->height, max_txsize_rect_lookup[plane_bsize], mode, + 0, 0, FILTER_INTRA_MODES, ctx->plane[plane], ctx->stride[plane], dst, dst_stride, 0, 0, plane); } diff --git a/third_party/libaom/source/libaom/av1/common/reconinter.h b/third_party/libaom/source/libaom/av1/common/reconinter.h index c8696160b6..056dc67d07 100644 --- a/third_party/libaom/source/libaom/av1/common/reconinter.h +++ b/third_party/libaom/source/libaom/av1/common/reconinter.h @@ -368,8 +368,8 @@ static INLINE const uint8_t *av1_get_contiguous_soft_mask(int8_t wedge_index, } void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm, - const MB_MODE_INFO *mbmi, int order_idx, - int *fwd_offset, int *bck_offset, + const MB_MODE_INFO *mbmi, int 
*fwd_offset, + int *bck_offset, int *use_dist_wtd_comp_avg, int is_compound); diff --git a/third_party/libaom/source/libaom/av1/common/reconintra.c b/third_party/libaom/source/libaom/av1/common/reconintra.c index 0c01f92183..51b01786ea 100644 --- a/third_party/libaom/source/libaom/av1/common/reconintra.c +++ b/third_party/libaom/source/libaom/av1/common/reconintra.c @@ -193,7 +193,7 @@ static const uint8_t *get_has_tr_table(PARTITION_TYPE partition, return ret; } -static int has_top_right(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row, +static int has_top_right(BLOCK_SIZE sb_size, BLOCK_SIZE bsize, int mi_row, int mi_col, int top_available, int right_available, PARTITION_TYPE partition, TX_SIZE txsz, int row_off, int col_off, int ss_x, int ss_y) { @@ -223,7 +223,7 @@ static int has_top_right(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row, const int bw_in_mi_log2 = mi_size_wide_log2[bsize]; const int bh_in_mi_log2 = mi_size_high_log2[bsize]; - const int sb_mi_size = mi_size_high[cm->seq_params.sb_size]; + const int sb_mi_size = mi_size_high[sb_size]; const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2; const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2; @@ -378,7 +378,7 @@ static const uint8_t *get_has_bl_table(PARTITION_TYPE partition, return ret; } -static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row, +static int has_bottom_left(BLOCK_SIZE sb_size, BLOCK_SIZE bsize, int mi_row, int mi_col, int bottom_available, int left_available, PARTITION_TYPE partition, TX_SIZE txsz, int row_off, int col_off, int ss_x, int ss_y) { @@ -415,7 +415,7 @@ static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row, const int bw_in_mi_log2 = mi_size_wide_log2[bsize]; const int bh_in_mi_log2 = mi_size_high_log2[bsize]; - const int sb_mi_size = mi_size_high[cm->seq_params.sb_size]; + const int sb_mi_size = mi_size_high[sb_size]; const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> 
bh_in_mi_log2; const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2; @@ -971,7 +971,7 @@ static int is_smooth(const MB_MODE_INFO *mbmi, int plane) { } } -static int get_filt_type(const MACROBLOCKD *xd, int plane) { +static int get_intra_edge_filter_type(const MACROBLOCKD *xd, int plane) { int ab_sm, le_sm; if (plane == 0) { @@ -1144,11 +1144,11 @@ void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd) { } #if CONFIG_AV1_HIGHBITDEPTH static void build_intra_predictors_high( - const MACROBLOCKD *xd, const uint8_t *ref8, int ref_stride, uint8_t *dst8, - int dst_stride, PREDICTION_MODE mode, int angle_delta, - FILTER_INTRA_MODE filter_intra_mode, TX_SIZE tx_size, - int disable_edge_filter, int n_top_px, int n_topright_px, int n_left_px, - int n_bottomleft_px, int plane) { + const uint8_t *ref8, int ref_stride, uint8_t *dst8, int dst_stride, + PREDICTION_MODE mode, int angle_delta, FILTER_INTRA_MODE filter_intra_mode, + TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px, + int n_left_px, int n_bottomleft_px, int intra_edge_filter_type, + int bit_depth) { int i; uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); @@ -1166,7 +1166,7 @@ static void build_intra_predictors_high( int p_angle = 0; const int is_dr_mode = av1_is_directional_mode(mode); const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; - int base = 128 << (xd->bd - 8); + int base = 128 << (bit_depth - 8); // The left_data, above_data buffers must be zeroed to fix some intermittent // valgrind errors. Uninitialized reads in intra pred modules (e.g. 
width = 4 // path in av1_highbd_dr_prediction_z2_avx2()) from left_data, above_data are @@ -1270,7 +1270,7 @@ static void build_intra_predictors_high( if (use_filter_intra) { highbd_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col, - filter_intra_mode, xd->bd); + filter_intra_mode, bit_depth); return; } @@ -1280,61 +1280,57 @@ static void build_intra_predictors_high( if (!disable_edge_filter) { const int need_right = p_angle < 90; const int need_bottom = p_angle > 180; - const int filt_type = get_filt_type(xd, plane); if (p_angle != 90 && p_angle != 180) { const int ab_le = need_above_left ? 1 : 0; if (need_above && need_left && (txwpx + txhpx >= 24)) { filter_intra_edge_corner_high(above_row, left_col); } if (need_above && n_top_px > 0) { - const int strength = - intra_edge_filter_strength(txwpx, txhpx, p_angle - 90, filt_type); + const int strength = intra_edge_filter_strength( + txwpx, txhpx, p_angle - 90, intra_edge_filter_type); const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0); av1_filter_intra_edge_high(above_row - ab_le, n_px, strength); } if (need_left && n_left_px > 0) { const int strength = intra_edge_filter_strength( - txhpx, txwpx, p_angle - 180, filt_type); + txhpx, txwpx, p_angle - 180, intra_edge_filter_type); const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0); av1_filter_intra_edge_high(left_col - ab_le, n_px, strength); } } - upsample_above = - av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type); + upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, + intra_edge_filter_type); if (need_above && upsample_above) { const int n_px = txwpx + (need_right ? 
txhpx : 0); - av1_upsample_intra_edge_high(above_row, n_px, xd->bd); + av1_upsample_intra_edge_high(above_row, n_px, bit_depth); } - upsample_left = - av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type); + upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, + intra_edge_filter_type); if (need_left && upsample_left) { const int n_px = txhpx + (need_bottom ? txwpx : 0); - av1_upsample_intra_edge_high(left_col, n_px, xd->bd); + av1_upsample_intra_edge_high(left_col, n_px, bit_depth); } } highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col, - upsample_above, upsample_left, p_angle, xd->bd); + upsample_above, upsample_left, p_angle, bit_depth); return; } // predict if (mode == DC_PRED) { dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size]( - dst, dst_stride, above_row, left_col, xd->bd); + dst, dst_stride, above_row, left_col, bit_depth); } else { - pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, xd->bd); + pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, bit_depth); } } #endif // CONFIG_AV1_HIGHBITDEPTH -static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, - int ref_stride, uint8_t *dst, int dst_stride, - PREDICTION_MODE mode, int angle_delta, - FILTER_INTRA_MODE filter_intra_mode, - TX_SIZE tx_size, int disable_edge_filter, - int n_top_px, int n_topright_px, - int n_left_px, int n_bottomleft_px, - int plane) { +static void build_intra_predictors( + const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, + PREDICTION_MODE mode, int angle_delta, FILTER_INTRA_MODE filter_intra_mode, + TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px, + int n_left_px, int n_bottomleft_px, int intra_edge_filter_type) { int i; const uint8_t *above_ref = ref - ref_stride; const uint8_t *left_ref = ref - 1; @@ -1462,33 +1458,32 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, if (!disable_edge_filter) { const int need_right 
= p_angle < 90; const int need_bottom = p_angle > 180; - const int filt_type = get_filt_type(xd, plane); if (p_angle != 90 && p_angle != 180) { const int ab_le = need_above_left ? 1 : 0; if (need_above && need_left && (txwpx + txhpx >= 24)) { filter_intra_edge_corner(above_row, left_col); } if (need_above && n_top_px > 0) { - const int strength = - intra_edge_filter_strength(txwpx, txhpx, p_angle - 90, filt_type); + const int strength = intra_edge_filter_strength( + txwpx, txhpx, p_angle - 90, intra_edge_filter_type); const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0); av1_filter_intra_edge(above_row - ab_le, n_px, strength); } if (need_left && n_left_px > 0) { const int strength = intra_edge_filter_strength( - txhpx, txwpx, p_angle - 180, filt_type); + txhpx, txwpx, p_angle - 180, intra_edge_filter_type); const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0); av1_filter_intra_edge(left_col - ab_le, n_px, strength); } } - upsample_above = - av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type); + upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, + intra_edge_filter_type); if (need_above && upsample_above) { const int n_px = txwpx + (need_right ? txhpx : 0); av1_upsample_intra_edge(above_row, n_px); } - upsample_left = - av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type); + upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, + intra_edge_filter_type); if (need_left && upsample_left) { const int n_px = txhpx + (need_bottom ? 
txwpx : 0); av1_upsample_intra_edge(left_col, n_px); @@ -1559,11 +1554,14 @@ static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x, return bs; } -void av1_predict_intra_block( - const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx, - TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette, - FILTER_INTRA_MODE filter_intra_mode, const uint8_t *ref, int ref_stride, - uint8_t *dst, int dst_stride, int col_off, int row_off, int plane) { +void av1_predict_intra_block(const MACROBLOCKD *xd, BLOCK_SIZE sb_size, + int enable_intra_edge_filter, int wpx, int hpx, + TX_SIZE tx_size, PREDICTION_MODE mode, + int angle_delta, int use_palette, + FILTER_INTRA_MODE filter_intra_mode, + const uint8_t *ref, int ref_stride, uint8_t *dst, + int dst_stride, int col_off, int row_off, + int plane) { const MB_MODE_INFO *const mbmi = xd->mi[0]; const int txwpx = tx_size_wide[tx_size]; const int txhpx = tx_size_high[tx_size]; @@ -1626,32 +1624,32 @@ void av1_predict_intra_block( } const int have_top_right = - has_top_right(cm, bsize, mi_row, mi_col, have_top, right_available, + has_top_right(sb_size, bsize, mi_row, mi_col, have_top, right_available, partition, tx_size, row_off, col_off, ss_x, ss_y); - const int have_bottom_left = - has_bottom_left(cm, bsize, mi_row, mi_col, bottom_available, have_left, - partition, tx_size, row_off, col_off, ss_x, ss_y); + const int have_bottom_left = has_bottom_left( + sb_size, bsize, mi_row, mi_col, bottom_available, have_left, partition, + tx_size, row_off, col_off, ss_x, ss_y); - const int disable_edge_filter = !cm->seq_params.enable_intra_edge_filter; + const int disable_edge_filter = !enable_intra_edge_filter; + const int intra_edge_filter_type = get_intra_edge_filter_type(xd, plane); #if CONFIG_AV1_HIGHBITDEPTH if (is_cur_buf_hbd(xd)) { build_intra_predictors_high( - xd, ref, ref_stride, dst, dst_stride, mode, angle_delta, - filter_intra_mode, tx_size, disable_edge_filter, - have_top ? 
AOMMIN(txwpx, xr + txwpx) : 0, + ref, ref_stride, dst, dst_stride, mode, angle_delta, filter_intra_mode, + tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0, have_top_right ? AOMMIN(txwpx, xr) : 0, have_left ? AOMMIN(txhpx, yd + txhpx) : 0, - have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane); + have_bottom_left ? AOMMIN(txhpx, yd) : 0, intra_edge_filter_type, + xd->bd); return; } #endif - build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, - angle_delta, filter_intra_mode, tx_size, - disable_edge_filter, - have_top ? AOMMIN(txwpx, xr + txwpx) : 0, - have_top_right ? AOMMIN(txwpx, xr) : 0, - have_left ? AOMMIN(txhpx, yd + txhpx) : 0, - have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane); + build_intra_predictors( + ref, ref_stride, dst, dst_stride, mode, angle_delta, filter_intra_mode, + tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0, + have_top_right ? AOMMIN(txwpx, xr) : 0, + have_left ? AOMMIN(txhpx, yd + txhpx) : 0, + have_bottom_left ? AOMMIN(txhpx, yd) : 0, intra_edge_filter_type); } void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, @@ -1669,6 +1667,7 @@ void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, ? 
mbmi->filter_intra_mode_info.filter_intra_mode : FILTER_INTRA_MODES; const int angle_delta = mbmi->angle_delta[plane != AOM_PLANE_Y] * ANGLE_STEP; + const SequenceHeader *seq_params = cm->seq_params; if (plane != AOM_PLANE_Y && mbmi->uv_mode == UV_CFL_PRED) { #if CONFIG_DEBUG @@ -1687,10 +1686,11 @@ void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, CFL_CTX *const cfl = &xd->cfl; CFL_PRED_TYPE pred_plane = get_cfl_pred_type(plane); if (cfl->dc_pred_is_cached[pred_plane] == 0) { - av1_predict_intra_block(cm, xd, pd->width, pd->height, tx_size, mode, - angle_delta, use_palette, filter_intra_mode, dst, - dst_stride, dst, dst_stride, blk_col, blk_row, - plane); + av1_predict_intra_block(xd, seq_params->sb_size, + seq_params->enable_intra_edge_filter, pd->width, + pd->height, tx_size, mode, angle_delta, + use_palette, filter_intra_mode, dst, dst_stride, + dst, dst_stride, blk_col, blk_row, plane); if (cfl->use_dc_pred_cache) { cfl_store_dc_pred(xd, dst, pred_plane, tx_size_wide[tx_size]); cfl->dc_pred_is_cached[pred_plane] = 1; @@ -1701,9 +1701,10 @@ void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, cfl_predict_block(xd, dst, dst_stride, tx_size, plane); return; } - av1_predict_intra_block(cm, xd, pd->width, pd->height, tx_size, mode, - angle_delta, use_palette, filter_intra_mode, dst, - dst_stride, dst, dst_stride, blk_col, blk_row, plane); + av1_predict_intra_block( + xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, pd->width, + pd->height, tx_size, mode, angle_delta, use_palette, filter_intra_mode, + dst, dst_stride, dst, dst_stride, blk_col, blk_row, plane); } void av1_init_intra_predictors(void) { diff --git a/third_party/libaom/source/libaom/av1/common/reconintra.h b/third_party/libaom/source/libaom/av1/common/reconintra.h index 907db5daf8..fa66ccd541 100644 --- a/third_party/libaom/source/libaom/av1/common/reconintra.h +++ b/third_party/libaom/source/libaom/av1/common/reconintra.h @@ -26,11 +26,14 @@ 
void av1_init_intra_predictors(void); void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, int blk_col, int blk_row, TX_SIZE tx_size); -void av1_predict_intra_block( - const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx, - TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette, - FILTER_INTRA_MODE filter_intra_mode, const uint8_t *ref, int ref_stride, - uint8_t *dst, int dst_stride, int col_off, int row_off, int plane); +void av1_predict_intra_block(const MACROBLOCKD *xd, BLOCK_SIZE sb_size, + int enable_intra_edge_filter, int wpx, int hpx, + TX_SIZE tx_size, PREDICTION_MODE mode, + int angle_delta, int use_palette, + FILTER_INTRA_MODE filter_intra_mode, + const uint8_t *ref, int ref_stride, uint8_t *dst, + int dst_stride, int col_off, int row_off, + int plane); // Mapping of interintra to intra mode for use in the intra component static const PREDICTION_MODE interintra_to_intra_mode[INTERINTRA_MODES] = { @@ -64,7 +67,7 @@ static INLINE int av1_allow_intrabc(const AV1_COMMON *const cm) { static INLINE int av1_filter_intra_allowed_bsize(const AV1_COMMON *const cm, BLOCK_SIZE bs) { - if (!cm->seq_params.enable_filter_intra || bs == BLOCK_INVALID) return 0; + if (!cm->seq_params->enable_filter_intra || bs == BLOCK_INVALID) return 0; return block_size_wide[bs] <= 32 && block_size_high[bs] <= 32; } diff --git a/third_party/libaom/source/libaom/av1/common/resize.c b/third_party/libaom/source/libaom/av1/common/resize.c index 0cfb5a29b8..112a08a539 100644 --- a/third_party/libaom/source/libaom/av1/common/resize.c +++ b/third_party/libaom/source/libaom/av1/common/resize.c @@ -1263,7 +1263,7 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int plane, int rows) { const int is_uv = (plane > 0); - const int ss_x = is_uv && cm->seq_params.subsampling_x; + const int ss_x = is_uv && cm->seq_params->subsampling_x; const int downscaled_plane_width = 
ROUND_POWER_OF_TWO(cm->width, ss_x); const int upscaled_plane_width = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x); @@ -1305,11 +1305,11 @@ void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src, const int pad_right = (j == cm->tiles.cols - 1); #if CONFIG_AV1_HIGHBITDEPTH - if (cm->seq_params.use_highbitdepth) + if (cm->seq_params->use_highbitdepth) highbd_upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width, dst_stride, x_step_qn, x0_qn, pad_left, pad_right, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); else upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr, rows, dst_width, dst_stride, x_step_qn, x0_qn, @@ -1354,18 +1354,18 @@ YV12_BUFFER_CONFIG *av1_scale_if_required( if (scaling_required) { const int num_planes = av1_num_planes(cm); #if CONFIG_AV1_HIGHBITDEPTH - if (use_optimized_scaler && cm->seq_params.bit_depth == AOM_BITS_8) { + if (use_optimized_scaler && cm->seq_params->bit_depth == AOM_BITS_8) { av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes); } else { av1_resize_and_extend_frame_nonnormative( - unscaled, scaled, (int)cm->seq_params.bit_depth, num_planes); + unscaled, scaled, (int)cm->seq_params->bit_depth, num_planes); } #else if (use_optimized_scaler) { av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes); } else { av1_resize_and_extend_frame_nonnormative( - unscaled, scaled, (int)cm->seq_params.bit_depth, num_planes); + unscaled, scaled, (int)cm->seq_params->bit_depth, num_planes); } #endif return scaled; @@ -1432,7 +1432,7 @@ static void copy_buffer_config(const YV12_BUFFER_CONFIG *const src, void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) { const int num_planes = av1_num_planes(cm); if (!av1_superres_scaled(cm)) return; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; const int byte_alignment = cm->features.byte_alignment; 
YV12_BUFFER_CONFIG copy_buffer; @@ -1445,7 +1445,7 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) { ©_buffer, aligned_width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, byte_alignment)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate copy buffer for superres upscaling"); // Copy function assumes the frames are the same size. @@ -1468,7 +1468,7 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) { if (release_fb_cb(cb_priv, fb)) { unlock_buffer_pool(pool); aom_internal_error( - &cm->error, AOM_CODEC_MEM_ERROR, + cm->error, AOM_CODEC_MEM_ERROR, "Failed to free current frame buffer before superres upscaling"); } // aom_realloc_frame_buffer() leaves config data for frame_to_show intact @@ -1479,7 +1479,7 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) { AOM_BORDER_IN_PIXELS, byte_alignment, fb, cb, cb_priv, 0)) { unlock_buffer_pool(pool); aom_internal_error( - &cm->error, AOM_CODEC_MEM_ERROR, + cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate current frame buffer for superres upscaling"); } unlock_buffer_pool(pool); @@ -1495,7 +1495,7 @@ void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) { seq_params->subsampling_y, seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, byte_alignment)) aom_internal_error( - &cm->error, AOM_CODEC_MEM_ERROR, + cm->error, AOM_CODEC_MEM_ERROR, "Failed to reallocate current frame buffer for superres upscaling"); // Restore config data back to frame_to_show diff --git a/third_party/libaom/source/libaom/av1/common/restoration.c b/third_party/libaom/source/libaom/av1/common/restoration.c index 41d0e22501..202953c889 100644 --- a/third_party/libaom/source/libaom/av1/common/restoration.c +++ b/third_party/libaom/source/libaom/av1/common/restoration.c @@ -42,8 +42,8 @@ const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] 
= { AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) { AV1PixelRect rect; - int ss_x = is_uv && cm->seq_params.subsampling_x; - int ss_y = is_uv && cm->seq_params.subsampling_y; + int ss_x = is_uv && cm->seq_params->subsampling_x; + int ss_y = is_uv && cm->seq_params->subsampling_y; rect.top = 0; rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y); @@ -1107,7 +1107,7 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int optimized_lr, int num_planes) { - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; const int bit_depth = seq_params->bit_depth; const int highbd = seq_params->use_highbitdepth; lr_ctxt->dst = &cm->rst_frame; @@ -1118,7 +1118,7 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x, seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER, cm->features.byte_alignment, NULL, NULL, NULL, 0) < 0) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate restoration dst buffer"); lr_ctxt->on_rest_unit = filter_frame_on_unit; @@ -1299,7 +1299,7 @@ void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane, int32_t *tmpbuf, RestorationLineBuffers *rlbs) { const int is_uv = plane > 0; - const int ss_y = is_uv && cm->seq_params.subsampling_y; + const int ss_y = is_uv && cm->seq_params->subsampling_y; const RestorationInfo *rsi = &cm->rst_info[plane]; @@ -1315,7 +1315,7 @@ int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane, int *rrow1) { assert(rcol0 && rcol1 && rrow0 && rrow1); - if (bsize != cm->seq_params.sb_size) return 0; + if (bsize != cm->seq_params->sb_size) return 0; if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) return 0; assert(!cm->features.all_lossless); @@ -1345,8 +1345,8 @@ int 
av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane, const int vert_units = av1_lr_count_units_in_tile(size, tile_h); // The size of an MI-unit on this plane of the image - const int ss_x = is_uv && cm->seq_params.subsampling_x; - const int ss_y = is_uv && cm->seq_params.subsampling_y; + const int ss_x = is_uv && cm->seq_params->subsampling_x; + const int ss_y = is_uv && cm->seq_params->subsampling_y; const int mi_size_x = MI_SIZE >> ss_x; const int mi_size_y = MI_SIZE >> ss_y; @@ -1427,7 +1427,7 @@ static void save_deblock_boundary_lines( int upscaled_width; int line_bytes; if (av1_superres_scaled(cm)) { - const int ss_x = is_uv && cm->seq_params.subsampling_x; + const int ss_x = is_uv && cm->seq_params->subsampling_x; upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x; line_bytes = upscaled_width << use_highbd; if (use_highbd) @@ -1474,7 +1474,7 @@ static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame, // At the point where this function is called, we've already applied // superres. So we don't need to extend the lines here, we can just // pull directly from the topmost row of the upscaled frame. - const int ss_x = is_uv && cm->seq_params.subsampling_x; + const int ss_x = is_uv && cm->seq_params->subsampling_x; const int upscaled_width = av1_superres_scaled(cm) ? 
(cm->superres_upscaled_width + ss_x) >> ss_x : src_width; @@ -1494,7 +1494,7 @@ static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame, int use_highbd, int plane, AV1_COMMON *cm, int after_cdef) { const int is_uv = plane > 0; - const int ss_y = is_uv && cm->seq_params.subsampling_y; + const int ss_y = is_uv && cm->seq_params->subsampling_y; const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y; @@ -1559,7 +1559,7 @@ static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame, void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int after_cdef) { const int num_planes = av1_num_planes(cm); - const int use_highbd = cm->seq_params.use_highbitdepth; + const int use_highbd = cm->seq_params->use_highbitdepth; for (int p = 0; p < num_planes; ++p) { save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef); } diff --git a/third_party/libaom/source/libaom/av1/common/thread_common.c b/third_party/libaom/source/libaom/av1/common/thread_common.c index 638dc4c951..0c45749de1 100644 --- a/third_party/libaom/source/libaom/av1/common/thread_common.c +++ b/third_party/libaom/source/libaom/av1/common/thread_common.c @@ -152,6 +152,61 @@ static void loop_filter_data_reset(LFWorkerData *lf_data, } } +void av1_alloc_cdef_sync(AV1_COMMON *const cm, AV1CdefSync *cdef_sync, + int num_workers) { + if (num_workers < 1) return; +#if CONFIG_MULTITHREAD + if (cdef_sync->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, cdef_sync->mutex_, + aom_malloc(sizeof(*(cdef_sync->mutex_)))); + if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL); + } +#else + (void)cm; + (void)cdef_sync; +#endif // CONFIG_MULTITHREAD +} + +void av1_free_cdef_sync(AV1CdefSync *cdef_sync) { + if (cdef_sync == NULL) return; +#if CONFIG_MULTITHREAD + if (cdef_sync->mutex_ != NULL) { + pthread_mutex_destroy(cdef_sync->mutex_); + aom_free(cdef_sync->mutex_); + } +#endif // 
CONFIG_MULTITHREAD +} + +static INLINE void cdef_row_mt_sync_read(AV1CdefSync *const cdef_sync, + int row) { + if (!row) return; +#if CONFIG_MULTITHREAD + AV1CdefRowSync *const cdef_row_mt = cdef_sync->cdef_row_mt; + pthread_mutex_lock(cdef_row_mt[row - 1].row_mutex_); + while (cdef_row_mt[row - 1].is_row_done != 1) + pthread_cond_wait(cdef_row_mt[row - 1].row_cond_, + cdef_row_mt[row - 1].row_mutex_); + cdef_row_mt[row - 1].is_row_done = 0; + pthread_mutex_unlock(cdef_row_mt[row - 1].row_mutex_); +#else + (void)cdef_sync; +#endif // CONFIG_MULTITHREAD +} + +static INLINE void cdef_row_mt_sync_write(AV1CdefSync *const cdef_sync, + int row) { +#if CONFIG_MULTITHREAD + AV1CdefRowSync *const cdef_row_mt = cdef_sync->cdef_row_mt; + pthread_mutex_lock(cdef_row_mt[row].row_mutex_); + pthread_cond_signal(cdef_row_mt[row].row_cond_); + cdef_row_mt[row].is_row_done = 1; + pthread_mutex_unlock(cdef_row_mt[row].row_mutex_); +#else + (void)cdef_sync; + (void)row; +#endif // CONFIG_MULTITHREAD +} + static INLINE void sync_read(AV1LfSync *const lf_sync, int r, int c, int plane) { #if CONFIG_MULTITHREAD @@ -211,7 +266,7 @@ static void enqueue_lf_jobs(AV1LfSync *lf_sync, AV1_COMMON *cm, int start, #if CONFIG_LPF_MASK int is_decoding, #endif - int plane_start, int plane_end) { + int plane_start, int plane_end, int is_realtime) { int mi_row, plane, dir; AV1LfMTInfo *lf_job_queue = lf_sync->job_queue; lf_sync->jobs_enqueued = 0; @@ -238,6 +293,7 @@ static void enqueue_lf_jobs(AV1LfSync *lf_sync, AV1_COMMON *cm, int start, lf_job_queue->mi_row = mi_row; lf_job_queue->plane = plane; lf_job_queue->dir = dir; + lf_job_queue->is_realtime = is_realtime; lf_job_queue++; lf_sync->jobs_enqueued++; } @@ -272,7 +328,7 @@ static INLINE void thread_loop_filter_rows( const int sb_cols = ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2; - int mi_row, mi_col, plane, dir; + int mi_row, mi_col, plane, dir, is_realtime; int r, c; while (1) { @@ -283,17 +339,29 @@ static 
INLINE void thread_loop_filter_rows( plane = cur_job_info->plane; dir = cur_job_info->dir; r = mi_row >> MAX_MIB_SIZE_LOG2; + is_realtime = cur_job_info->is_realtime && !plane; if (dir == 0) { for (mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += MAX_MIB_SIZE) { c = mi_col >> MAX_MIB_SIZE_LOG2; - av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer, + av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer, mi_row, mi_col, plane, plane + 1); - +#if CONFIG_AV1_HIGHBITDEPTH + (void)is_realtime; av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row, mi_col); +#else + if (is_realtime) { + av1_filter_block_plane_vert_rt(cm, xd, plane, &planes[plane], + mi_row, mi_col); + + } else { + av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row, + mi_col); + } +#endif sync_write(lf_sync, r, c, sb_cols, plane); } } else if (dir == 1) { @@ -309,10 +377,21 @@ static INLINE void thread_loop_filter_rows( // completed sync_read(lf_sync, r + 1, c, plane); - av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer, + av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer, mi_row, mi_col, plane, plane + 1); +#if CONFIG_AV1_HIGHBITDEPTH + (void)is_realtime; av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row, mi_col); +#else + if (is_realtime) { + av1_filter_block_plane_horz_rt(cm, xd, plane, &planes[plane], + mi_row, mi_col); + } else { + av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row, + mi_col); + } +#endif } } } else { @@ -405,7 +484,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int is_decoding, #endif AVxWorker *workers, int nworkers, - AV1LfSync *lf_sync) { + AV1LfSync *lf_sync, int is_realtime) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); #if CONFIG_LPF_MASK int sb_rows; @@ -441,7 +520,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, #if CONFIG_LPF_MASK is_decoding, #endif - 
plane_start, plane_end); + plane_start, plane_end, is_realtime); // Set up loopfilter thread data. for (i = num_workers - 1; i >= 0; --i) { @@ -484,7 +563,7 @@ void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, int is_decoding, #endif AVxWorker *workers, int num_workers, - AV1LfSync *lf_sync) { + AV1LfSync *lf_sync, int is_realtime) { int start_mi_row, end_mi_row, mi_rows_to_filter; start_mi_row = 0; @@ -512,7 +591,7 @@ void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, // TODO(chengchen): can we remove this? struct macroblockd_plane *pd = xd->plane; - av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame, 0, 0, plane, + av1_setup_dst_planes(pd, cm->seq_params->sb_size, frame, 0, 0, plane, plane + 1); av1_build_bitmask_vert_info(cm, &pd[plane], plane); @@ -526,7 +605,7 @@ void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, } #else loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start, - plane_end, workers, num_workers, lf_sync); + plane_end, workers, num_workers, lf_sync, is_realtime); #endif } @@ -720,7 +799,7 @@ static void enqueue_lr_jobs(AV1LrSync *lr_sync, AV1LrStruct *lr_ctxt, for (int plane = 0; plane < num_planes; plane++) { if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; const int is_uv = plane > 0; - const int ss_y = is_uv && cm->seq_params.subsampling_y; + const int ss_y = is_uv && cm->seq_params->subsampling_y; AV1PixelRect tile_rect = ctxt[plane].tile_rect; const int unit_size = ctxt[plane].rsi->restoration_unit_size; @@ -932,3 +1011,198 @@ void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame, cm); } #endif + +// Initializes cdef_sync parameters. 
+static AOM_INLINE void reset_cdef_job_info(AV1CdefSync *const cdef_sync) { + cdef_sync->end_of_frame = 0; + cdef_sync->fbr = 0; + cdef_sync->fbc = 0; +} + +static AOM_INLINE void launch_cdef_workers(AVxWorker *const workers, + int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &workers[i]; + if (i == 0) + winterface->execute(worker); + else + winterface->launch(worker); + } +} + +static AOM_INLINE void sync_cdef_workers(AVxWorker *const workers, + AV1_COMMON *const cm, + int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int had_error = 0; + + // Wait for completion of Cdef frame. + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &workers[i]; + had_error |= !winterface->sync(worker); + } + if (had_error) + aom_internal_error(cm->error, AOM_CODEC_ERROR, + "Failed to process cdef frame"); +} + +// Updates the row index of the next job to be processed. +// Also updates end_of_frame flag when the processing of all rows is complete. +static void update_cdef_row_next_job_info(AV1CdefSync *const cdef_sync, + const int nvfb) { + cdef_sync->fbr++; + if (cdef_sync->fbr == nvfb) { + cdef_sync->end_of_frame = 1; + } +} + +// Checks if a job is available. If job is available, +// populates next job information and returns 1, else returns 0. +static AOM_INLINE int get_cdef_row_next_job(AV1CdefSync *const cdef_sync, + int *cur_fbr, const int nvfb) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(cdef_sync->mutex_); +#endif // CONFIG_MULTITHREAD + int do_next_row = 0; + // Populates information needed for current job and update the row + // index of the next row to be processed. 
+ if (cdef_sync->end_of_frame == 0) { + do_next_row = 1; + *cur_fbr = cdef_sync->fbr; + update_cdef_row_next_job_info(cdef_sync, nvfb); + } +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(cdef_sync->mutex_); +#endif // CONFIG_MULTITHREAD + return do_next_row; +} + +// Hook function for each thread in CDEF multi-threading. +static int cdef_sb_row_worker_hook(void *arg1, void *arg2) { + AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg1; + AV1CdefWorkerData *const cdef_worker = (AV1CdefWorkerData *)arg2; + const int nvfb = + (cdef_worker->cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + int cur_fbr; + while (get_cdef_row_next_job(cdef_sync, &cur_fbr, nvfb)) { + av1_cdef_fb_row(cdef_worker->cm, cdef_worker->xd, cdef_worker->linebuf, + cdef_worker->colbuf, cdef_worker->srcbuf, cur_fbr, + cdef_worker->cdef_init_fb_row_fn, cdef_sync); + } + return 1; +} + +// Assigns CDEF hook function and thread data to each worker. +static void prepare_cdef_frame_workers( + AV1_COMMON *const cm, MACROBLOCKD *xd, AV1CdefWorkerData *const cdef_worker, + AVxWorkerHook hook, AVxWorker *const workers, AV1CdefSync *const cdef_sync, + int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn) { + const int num_planes = av1_num_planes(cm); + + cdef_worker[0].srcbuf = cm->cdef_info.srcbuf; + for (int plane = 0; plane < num_planes; plane++) + cdef_worker[0].colbuf[plane] = cm->cdef_info.colbuf[plane]; + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &workers[i]; + cdef_worker[i].cm = cm; + cdef_worker[i].xd = xd; + cdef_worker[i].cdef_init_fb_row_fn = cdef_init_fb_row_fn; + for (int plane = 0; plane < num_planes; plane++) + cdef_worker[i].linebuf[plane] = cm->cdef_info.linebuf[plane]; + + worker->hook = hook; + worker->data1 = cdef_sync; + worker->data2 = &cdef_worker[i]; + } +} + +// Initializes row-level parameters for CDEF frame. 
+void av1_cdef_init_fb_row_mt(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + CdefBlockInfo *const fb_info, + uint16_t **const linebuf, uint16_t *const src, + struct AV1CdefSyncData *const cdef_sync, int fbr) { + const int num_planes = av1_num_planes(cm); + const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + const int luma_stride = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4); + + // for the current filter block, it's top left corner mi structure (mi_tl) + // is first accessed to check whether the top and left boundaries are + // frame boundaries. Then bottom-left and top-right mi structures are + // accessed to check whether the bottom and right boundaries + // (respectively) are frame boundaries. + // + // Note that we can't just check the bottom-right mi structure - eg. if + // we're at the right-hand edge of the frame but not the bottom, then + // the bottom-right mi is NULL but the bottom-left is not. + fb_info->frame_boundary[TOP] = (MI_SIZE_64X64 * fbr == 0) ? 1 : 0; + if (fbr != nvfb - 1) + fb_info->frame_boundary[BOTTOM] = + (MI_SIZE_64X64 * (fbr + 1) == cm->mi_params.mi_rows) ? 
1 : 0; + else + fb_info->frame_boundary[BOTTOM] = 1; + + fb_info->src = src; + fb_info->damping = cm->cdef_info.cdef_damping; + fb_info->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0); + av1_zero(fb_info->dir); + av1_zero(fb_info->var); + + for (int plane = 0; plane < num_planes; plane++) { + const int stride = luma_stride >> xd->plane[plane].subsampling_x; + uint16_t *top_linebuf = &linebuf[plane][0]; + uint16_t *bot_linebuf = &linebuf[plane][nvfb * CDEF_VBORDER * stride]; + { + const int mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y; + const int top_offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2; + const int bot_offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2; + + if (fbr != nvfb - 1) // if (fbr != 0) // top line buffer copy + av1_cdef_copy_sb8_16( + cm, &top_linebuf[(fbr + 1) * CDEF_VBORDER * stride], stride, + xd->plane[plane].dst.buf, top_offset - CDEF_VBORDER, 0, + xd->plane[plane].dst.stride, CDEF_VBORDER, stride); + if (fbr != nvfb - 1) // bottom line buffer copy + av1_cdef_copy_sb8_16(cm, &bot_linebuf[fbr * CDEF_VBORDER * stride], + stride, xd->plane[plane].dst.buf, bot_offset, 0, + xd->plane[plane].dst.stride, CDEF_VBORDER, stride); + } + + fb_info->top_linebuf[plane] = &linebuf[plane][fbr * CDEF_VBORDER * stride]; + fb_info->bot_linebuf[plane] = + &linebuf[plane] + [nvfb * CDEF_VBORDER * stride + (fbr * CDEF_VBORDER * stride)]; + } + + cdef_row_mt_sync_write(cdef_sync, fbr); + cdef_row_mt_sync_read(cdef_sync, fbr); +} + +// Implements multi-threading for CDEF. +// Perform CDEF on input frame. +// Inputs: +// frame: Pointer to input frame buffer. +// cm: Pointer to common structure. +// xd: Pointer to common current coding block structure. +// Returns: +// Nothing will be returned. 
+void av1_cdef_frame_mt(AV1_COMMON *const cm, MACROBLOCKD *const xd, + AV1CdefWorkerData *const cdef_worker, + AVxWorker *const workers, AV1CdefSync *const cdef_sync, + int num_workers, + cdef_init_fb_row_t cdef_init_fb_row_fn) { + YV12_BUFFER_CONFIG *frame = &cm->cur_frame->buf; + const int num_planes = av1_num_planes(cm); + + av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0, + num_planes); + + reset_cdef_job_info(cdef_sync); + prepare_cdef_frame_workers(cm, xd, cdef_worker, cdef_sb_row_worker_hook, + workers, cdef_sync, num_workers, + cdef_init_fb_row_fn); + launch_cdef_workers(workers, num_workers); + sync_cdef_workers(workers, cm, num_workers); +} diff --git a/third_party/libaom/source/libaom/av1/common/thread_common.h b/third_party/libaom/source/libaom/av1/common/thread_common.h index 97b8abcff6..bcb4b879c1 100644 --- a/third_party/libaom/source/libaom/av1/common/thread_common.h +++ b/third_party/libaom/source/libaom/av1/common/thread_common.h @@ -15,6 +15,7 @@ #include "config/aom_config.h" #include "av1/common/av1_loopfilter.h" +#include "av1/common/cdef.h" #include "aom_util/aom_thread.h" #ifdef __cplusplus @@ -27,6 +28,7 @@ typedef struct AV1LfMTInfo { int mi_row; int plane; int dir; + int is_realtime; } AV1LfMTInfo; // Loopfilter row synchronization @@ -97,6 +99,55 @@ typedef struct AV1LrSyncData { int jobs_dequeued; } AV1LrSync; +typedef struct AV1CdefWorker { + AV1_COMMON *cm; + MACROBLOCKD *xd; + uint16_t *colbuf[MAX_MB_PLANE]; + uint16_t *srcbuf; + uint16_t *linebuf[MAX_MB_PLANE]; + cdef_init_fb_row_t cdef_init_fb_row_fn; +} AV1CdefWorkerData; + +typedef struct AV1CdefRowSync { +#if CONFIG_MULTITHREAD + pthread_mutex_t *row_mutex_; + pthread_cond_t *row_cond_; +#endif // CONFIG_MULTITHREAD + int is_row_done; +} AV1CdefRowSync; + +// Data related to CDEF search multi-thread synchronization. +typedef struct AV1CdefSyncData { +#if CONFIG_MULTITHREAD + // Mutex lock used while dispatching jobs. 
+ pthread_mutex_t *mutex_; +#endif // CONFIG_MULTITHREAD + // Data related to CDEF row mt sync information + AV1CdefRowSync *cdef_row_mt; + // Flag to indicate all blocks are processed and end of frame is reached + int end_of_frame; + // Row index in units of 64x64 block + int fbr; + // Column index in units of 64x64 block + int fbc; +} AV1CdefSync; + +void av1_cdef_frame_mt(AV1_COMMON *const cm, MACROBLOCKD *const xd, + AV1CdefWorkerData *const cdef_worker, + AVxWorker *const workers, AV1CdefSync *const cdef_sync, + int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn); +void av1_cdef_init_fb_row_mt(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + CdefBlockInfo *const fb_info, + uint16_t **const linebuf, uint16_t *const src, + struct AV1CdefSyncData *const cdef_sync, int fbr); +void av1_cdef_copy_sb8_16(const AV1_COMMON *const cm, uint16_t *const dst, + int dstride, const uint8_t *src, int src_voffset, + int src_hoffset, int sstride, int vsize, int hsize); +void av1_alloc_cdef_sync(AV1_COMMON *const cm, AV1CdefSync *cdef_sync, + int num_workers); +void av1_free_cdef_sync(AV1CdefSync *cdef_sync); + // Deallocate loopfilter synchronization related mutex and data. 
void av1_loop_filter_dealloc(AV1LfSync *lf_sync); @@ -107,7 +158,7 @@ void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, int is_decoding, #endif AVxWorker *workers, int num_workers, - AV1LfSync *lf_sync); + AV1LfSync *lf_sync, int is_realtime); #if !CONFIG_REALTIME_ONLY void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame, diff --git a/third_party/libaom/source/libaom/av1/common/tile_common.c b/third_party/libaom/source/libaom/av1/common/tile_common.c index 1b11bd7606..8f5d2a6316 100644 --- a/third_party/libaom/source/libaom/av1/common/tile_common.c +++ b/third_party/libaom/source/libaom/av1/common/tile_common.c @@ -28,7 +28,7 @@ static int tile_log2(int blk_size, int target) { } void av1_get_tile_limits(AV1_COMMON *const cm) { - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; CommonTileParams *const tiles = &cm->tiles; const int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2); @@ -130,9 +130,9 @@ void av1_calculate_tile_rows(const SequenceHeader *const seq_params, void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) { assert(row < cm->tiles.rows); int mi_row_start = cm->tiles.row_start_sb[row] - << cm->seq_params.mib_size_log2; + << cm->seq_params->mib_size_log2; int mi_row_end = cm->tiles.row_start_sb[row + 1] - << cm->seq_params.mib_size_log2; + << cm->seq_params->mib_size_log2; tile->tile_row = row; tile->mi_row_start = mi_row_start; tile->mi_row_end = AOMMIN(mi_row_end, cm->mi_params.mi_rows); @@ -142,9 +142,9 @@ void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) { void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) { assert(col < cm->tiles.cols); int mi_col_start = cm->tiles.col_start_sb[col] - << cm->seq_params.mib_size_log2; + << cm->seq_params->mib_size_log2; int mi_col_end = cm->tiles.col_start_sb[col + 1] - << cm->seq_params.mib_size_log2; + << 
cm->seq_params->mib_size_log2; tile->tile_col = col; tile->mi_col_start = mi_col_start; tile->mi_col_end = AOMMIN(mi_col_end, cm->mi_params.mi_cols); @@ -153,16 +153,16 @@ void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) { int av1_get_sb_rows_in_tile(AV1_COMMON *cm, TileInfo tile) { int mi_rows_aligned_to_sb = ALIGN_POWER_OF_TWO( - tile.mi_row_end - tile.mi_row_start, cm->seq_params.mib_size_log2); - int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2; + tile.mi_row_end - tile.mi_row_start, cm->seq_params->mib_size_log2); + int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params->mib_size_log2; return sb_rows; } int av1_get_sb_cols_in_tile(AV1_COMMON *cm, TileInfo tile) { int mi_cols_aligned_to_sb = ALIGN_POWER_OF_TWO( - tile.mi_col_end - tile.mi_col_start, cm->seq_params.mib_size_log2); - int sb_cols = mi_cols_aligned_to_sb >> cm->seq_params.mib_size_log2; + tile.mi_col_end - tile.mi_col_start, cm->seq_params->mib_size_log2); + int sb_cols = mi_cols_aligned_to_sb >> cm->seq_params->mib_size_log2; return sb_cols; } @@ -195,8 +195,8 @@ AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm, r.bottom = AOMMIN(r.bottom, frame_h); // Convert to coordinates in the appropriate plane - const int ss_x = is_uv && cm->seq_params.subsampling_x; - const int ss_y = is_uv && cm->seq_params.subsampling_y; + const int ss_x = is_uv && cm->seq_params->subsampling_x; + const int ss_y = is_uv && cm->seq_params->subsampling_y; r.left = ROUND_POWER_OF_TWO(r.left, ss_x); r.right = ROUND_POWER_OF_TWO(r.right, ss_x); @@ -215,7 +215,7 @@ void av1_get_uniform_tile_size(const AV1_COMMON *cm, int *w, int *h) { for (int i = 0; i < tiles->cols; ++i) { const int tile_width_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i]; - const int tile_w = tile_width_sb * cm->seq_params.mib_size; + const int tile_w = tile_width_sb * cm->seq_params->mib_size; assert(i == 0 || tile_w == *w); // ensure all tiles have same dimension *w = tile_w; } 
@@ -223,7 +223,7 @@ void av1_get_uniform_tile_size(const AV1_COMMON *cm, int *w, int *h) { for (int i = 0; i < tiles->rows; ++i) { const int tile_height_sb = tiles->row_start_sb[i + 1] - tiles->row_start_sb[i]; - const int tile_h = tile_height_sb * cm->seq_params.mib_size; + const int tile_h = tile_height_sb * cm->seq_params->mib_size; assert(i == 0 || tile_h == *h); // ensure all tiles have same dimension *h = tile_h; } diff --git a/third_party/libaom/source/libaom/av1/decoder/decodeframe.c b/third_party/libaom/source/libaom/av1/decoder/decodeframe.c index b364714e0a..9ca7d3cd35 100644 --- a/third_party/libaom/source/libaom/av1/decoder/decodeframe.c +++ b/third_party/libaom/source/libaom/av1/decoder/decodeframe.c @@ -76,12 +76,11 @@ // Checks that the remaining bits start with a 1 and ends with 0s. // It consumes an additional byte, if already byte aligned before the check. int av1_check_trailing_bits(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) { - AV1_COMMON *const cm = &pbi->common; // bit_offset is set to 0 (mod 8) when the reader is already byte aligned int bits_before_alignment = 8 - rb->bit_offset % 8; int trailing = aom_rb_read_literal(rb, bits_before_alignment); if (trailing != (1 << (bits_before_alignment - 1))) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } return 0; @@ -304,16 +303,18 @@ static AOM_INLINE void decode_reconstruct_tx( const int bsw = tx_size_wide_unit[sub_txs]; const int bsh = tx_size_high_unit[sub_txs]; const int sub_step = bsw * bsh; + const int row_end = + AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); + const int col_end = + AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); assert(bsw > 0 && bsh > 0); - for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { - for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { - const int offsetr = blk_row + row; + for (int row = 0; row < row_end; row += bsh) { + const int 
offsetr = blk_row + row; + for (int col = 0; col < col_end; col += bsw) { const int offsetc = blk_col + col; - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; - decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize, offsetr, offsetc, block, sub_txs, eob_total); block += sub_step; @@ -362,7 +363,7 @@ static AOM_INLINE void decode_mbmi_block(AV1Decoder *const pbi, PARTITION_TYPE partition, BLOCK_SIZE bsize) { AV1_COMMON *const cm = &pbi->common; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; const int bw = mi_size_wide[bsize]; const int bh = mi_size_high[bsize]; const int x_mis = AOMMIN(bw, cm->mi_params.mi_cols - mi_col); @@ -914,6 +915,16 @@ static AOM_INLINE void decode_token_recon_block(AV1Decoder *const pbi, if (plane && !xd->is_chroma_ref) break; const struct macroblockd_plane *const pd = &xd->plane[plane]; const TX_SIZE tx_size = av1_get_tx_size(plane, xd); +#if CONFIG_REALTIME_ONLY + // Realtime only build doesn't support 4x rectangular txfm sizes. + if (tx_size == TX_4X16 || tx_size == TX_16X4 || tx_size == TX_8X32 || + tx_size == TX_32X8 || tx_size == TX_16X64 || + tx_size == TX_64X16) { + aom_internal_error( + xd->error_info, AOM_CODEC_UNSUP_FEATURE, + "Realtime only build doesn't support rectangular txfm sizes"); + } +#endif const int stepr = tx_size_high_unit[tx_size]; const int stepc = tx_size_wide_unit[tx_size]; @@ -1219,9 +1230,9 @@ static AOM_INLINE void parse_decode_block(AV1Decoder *const pbi, : (j == 1 ? 
quant_params->u_ac_delta_q : quant_params->v_ac_delta_q); xd->plane[j].seg_dequant_QTX[i][0] = av1_dc_quant_QTX( - current_qindex, dc_delta_q, cm->seq_params.bit_depth); + current_qindex, dc_delta_q, cm->seq_params->bit_depth); xd->plane[j].seg_dequant_QTX[i][1] = av1_ac_quant_QTX( - current_qindex, ac_delta_q, cm->seq_params.bit_depth); + current_qindex, ac_delta_q, cm->seq_params->bit_depth); } } } @@ -1554,9 +1565,9 @@ static AOM_INLINE void decode_restoration_mode(AV1_COMMON *cm, } } if (!all_none) { - assert(cm->seq_params.sb_size == BLOCK_64X64 || - cm->seq_params.sb_size == BLOCK_128X128); - const int sb_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64; + assert(cm->seq_params->sb_size == BLOCK_64X64 || + cm->seq_params->sb_size == BLOCK_128X128); + const int sb_size = cm->seq_params->sb_size == BLOCK_128X128 ? 128 : 64; for (int p = 0; p < num_planes; ++p) cm->rst_info[p].restoration_unit_size = sb_size; @@ -1576,7 +1587,8 @@ static AOM_INLINE void decode_restoration_mode(AV1_COMMON *cm, } if (num_planes > 1) { - int s = AOMMIN(cm->seq_params.subsampling_x, cm->seq_params.subsampling_y); + int s = + AOMMIN(cm->seq_params->subsampling_x, cm->seq_params->subsampling_y); if (s && !chroma_none) { cm->rst_info[1].restoration_unit_size = cm->rst_info[0].restoration_unit_size >> (aom_rb_read_bit(rb) * s); @@ -1847,7 +1859,7 @@ static AOM_INLINE void setup_quantization(CommonQuantParams *quant_params, // Build y/uv dequant values based on segmentation. static AOM_INLINE void setup_segmentation_dequant(AV1_COMMON *const cm, MACROBLOCKD *const xd) { - const int bit_depth = cm->seq_params.bit_depth; + const int bit_depth = cm->seq_params->bit_depth; // When segmentation is disabled, only the first value is used. The // remaining are don't cares. const int max_segments = cm->seg.enabled ? 
MAX_SEGMENTS : 1; @@ -1909,7 +1921,7 @@ static AOM_INLINE void setup_superres(AV1_COMMON *const cm, cm->superres_upscaled_width = *width; cm->superres_upscaled_height = *height; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; if (!seq_params->enable_superres) return; if (aom_rb_read_bit(rb)) { @@ -1930,7 +1942,7 @@ static AOM_INLINE void resize_context_buffers(AV1_COMMON *cm, int width, int height) { #if CONFIG_SIZE_LIMIT if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Dimensions of %dx%d beyond allowed size of %dx%d.", width, height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT); #endif @@ -1950,7 +1962,7 @@ static AOM_INLINE void resize_context_buffers(AV1_COMMON *cm, int width, // consistent and to force a realloc next time. cm->width = 0; cm->height = 0; - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate context buffers"); } } else { @@ -1968,7 +1980,7 @@ static AOM_INLINE void resize_context_buffers(AV1_COMMON *cm, int width, static AOM_INLINE void setup_buffer_pool(AV1_COMMON *cm) { BufferPool *const pool = cm->buffer_pool; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; lock_buffer_pool(pool); if (aom_realloc_frame_buffer( @@ -1978,7 +1990,7 @@ static AOM_INLINE void setup_buffer_pool(AV1_COMMON *cm) { &cm->cur_frame->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv, 0)) { unlock_buffer_pool(pool); - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } unlock_buffer_pool(pool); @@ -1999,7 +2011,7 @@ static AOM_INLINE void setup_buffer_pool(AV1_COMMON *cm) { static AOM_INLINE void setup_frame_size(AV1_COMMON *cm, int 
frame_size_override_flag, struct aom_read_bit_buffer *rb) { - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; int width, height; if (frame_size_override_flag) { @@ -2008,7 +2020,7 @@ static AOM_INLINE void setup_frame_size(AV1_COMMON *cm, av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height); if (width > seq_params->max_frame_width || height > seq_params->max_frame_height) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Frame dimensions are larger than the maximum values"); } } else { @@ -2049,7 +2061,7 @@ static AOM_INLINE void setup_frame_size_with_refs( // the middle of a stream, and static analysis will error if we don't do // a null check here. if (ref_buf == NULL) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Invalid condition: invalid reference buffer"); } else { const YV12_BUFFER_CONFIG *const buf = &ref_buf->buf; @@ -2065,7 +2077,7 @@ static AOM_INLINE void setup_frame_size_with_refs( } } - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; if (!found) { int num_bits_width = seq_params->num_bits_width; int num_bits_height = seq_params->num_bits_height; @@ -2077,7 +2089,7 @@ static AOM_INLINE void setup_frame_size_with_refs( } if (width <= 0 || height <= 0) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Invalid frame size"); // Check to make sure at least one of frames that this frame references @@ -2089,7 +2101,7 @@ static AOM_INLINE void setup_frame_size_with_refs( ref_frame->buf.y_crop_height, width, height); } if (!has_valid_ref_frame) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Referenced frame has invalid size"); for (int 
i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { const RefCntBuffer *const ref_frame = get_ref_frame_buf(cm, i); @@ -2097,7 +2109,7 @@ static AOM_INLINE void setup_frame_size_with_refs( ref_frame->buf.bit_depth, ref_frame->buf.subsampling_x, ref_frame->buf.subsampling_y, seq_params->bit_depth, seq_params->subsampling_x, seq_params->subsampling_y)) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Referenced frame has incompatible color format"); } setup_buffer_pool(cm); @@ -2117,7 +2129,7 @@ static int rb_read_uniform(struct aom_read_bit_buffer *const rb, int n) { static AOM_INLINE void read_tile_info_max_tile( AV1_COMMON *const cm, struct aom_read_bit_buffer *const rb) { - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; CommonTileParams *const tiles = &cm->tiles; int width_mi = ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2); @@ -2213,7 +2225,7 @@ static AOM_INLINE void read_tile_info(AV1Decoder *const pbi, pbi->context_update_tile_id = aom_rb_read_literal(rb, cm->tiles.log2_rows + cm->tiles.log2_cols); if (pbi->context_update_tile_id >= cm->tiles.rows * cm->tiles.cols) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Invalid context_update_tile_id"); } // tile size magnitude @@ -2366,7 +2378,7 @@ static const uint8_t *get_ls_tile_buffers( // Get the whole of the last column, otherwise stop at the required tile. for (int r = 0; r < (is_last ? 
tile_rows : tile_rows_end); ++r) { - get_ls_tile_buffer(tile_col_data_end[c], &pbi->common.error, &data, + get_ls_tile_buffer(tile_col_data_end[c], &pbi->error, &data, tile_buffers, tile_size_bytes, c, r, tile_copy_mode); } } @@ -2378,7 +2390,7 @@ static const uint8_t *get_ls_tile_buffers( data = tile_col_data_end[c - 1]; for (int r = 0; r < tile_rows; ++r) { - get_ls_tile_buffer(tile_col_data_end[c], &pbi->common.error, &data, + get_ls_tile_buffer(tile_col_data_end[c], &pbi->error, &data, tile_buffers, tile_size_bytes, c, r, tile_copy_mode); } } @@ -2446,11 +2458,11 @@ static AOM_INLINE void get_tile_buffers( if (tc < start_tile || tc > end_tile) continue; if (data + hdr_offset >= data_end) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Data ended before all tiles were read."); data += hdr_offset; - get_tile_buffer(data_end, pbi->tile_size_bytes, is_last, - &pbi->common.error, &data, buf); + get_tile_buffer(data_end, pbi->tile_size_bytes, is_last, &pbi->error, + &data, buf); } } } @@ -2460,7 +2472,7 @@ static AOM_INLINE void set_cb_buffer(AV1Decoder *pbi, DecoderCodingBlock *dcb, const int num_planes, int mi_row, int mi_col) { AV1_COMMON *const cm = &pbi->common; - int mib_size_log2 = cm->seq_params.mib_size_log2; + int mib_size_log2 = cm->seq_params->mib_size_log2; int stride = (cm->mi_params.mi_cols >> mib_size_log2) + 1; int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2); CB_BUFFER *cb_buffer = cb_buffer_base + offset; @@ -2629,11 +2641,11 @@ static AOM_INLINE void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td, pbi->tile_data + tile_info.tile_row * cm->tiles.cols + tile_info.tile_col; const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info); const int sb_row_in_tile = - (mi_row - tile_info.mi_row_start) >> cm->seq_params.mib_size_log2; + (mi_row - tile_info.mi_row_start) >> cm->seq_params->mib_size_log2; int sb_col_in_tile = 0; for (int mi_col = 
tile_info.mi_col_start; mi_col < tile_info.mi_col_end; - mi_col += cm->seq_params.mib_size, sb_col_in_tile++) { + mi_col += cm->seq_params->mib_size, sb_col_in_tile++) { set_cb_buffer(pbi, &td->dcb, pbi->cb_buffer_base, num_planes, mi_row, mi_col); @@ -2641,7 +2653,7 @@ static AOM_INLINE void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td, // Decoding of the super-block decode_partition(pbi, td, mi_row, mi_col, td->bit_reader, - cm->seq_params.sb_size, 0x2); + cm->seq_params->sb_size, 0x2); sync_write(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile, sb_cols_in_tile); @@ -2711,16 +2723,16 @@ static AOM_INLINE void decode_tile(AV1Decoder *pbi, ThreadData *const td, av1_reset_loop_restoration(xd, num_planes); for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end; - mi_row += cm->seq_params.mib_size) { + mi_row += cm->seq_params->mib_size) { av1_zero_left_context(xd); for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; - mi_col += cm->seq_params.mib_size) { + mi_col += cm->seq_params->mib_size) { set_cb_buffer(pbi, dcb, &td->cb_buffer_base, num_planes, 0, 0); // Bit-stream parsing and decoding of the superblock decode_partition(pbi, td, mi_row, mi_col, td->bit_reader, - cm->seq_params.sb_size, 0x3); + cm->seq_params->sb_size, 0x3); if (aom_reader_has_overflowed(td->bit_reader)) { aom_merge_corrupted_flag(&dcb->corrupted, 1); @@ -2801,6 +2813,10 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data, if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) { decoder_alloc_tile_data(pbi, n_tiles); } + if (pbi->dcb.xd.seg_mask == NULL) + CHECK_MEM_ERROR(cm, pbi->dcb.xd.seg_mask, + (uint8_t *)aom_memalign( + 16, 2 * MAX_SB_SQUARE * sizeof(*pbi->dcb.xd.seg_mask))); #if CONFIG_ACCOUNTING if (pbi->acct_enabled) { aom_accounting_reset(&pbi->accounting); @@ -2837,7 +2853,7 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data, av1_tile_init(&td->dcb.xd.tile, cm, row, 
col); td->dcb.xd.current_base_qindex = cm->quant_params.base_qindex; setup_bool_decoder(tile_bs_buf->data, data_end, tile_bs_buf->size, - &cm->error, td->bit_reader, allow_update_cdf); + &pbi->error, td->bit_reader, allow_update_cdf); #if CONFIG_ACCOUNTING if (pbi->acct_enabled) { td->bit_reader->accounting = &pbi->accounting; @@ -2859,7 +2875,7 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data, decode_tile(pbi, td, row, col); aom_merge_corrupted_flag(&pbi->dcb.corrupted, td->dcb.corrupted); if (pbi->dcb.corrupted) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Failed to decode tile data"); } } @@ -3017,7 +3033,7 @@ static int get_next_job_info(AV1Decoder *const pbi, const int tile_cols_end = frame_row_mt_info->tile_cols_end; const int start_tile = frame_row_mt_info->start_tile; const int end_tile = frame_row_mt_info->end_tile; - const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size]; + const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size]; int num_mis_to_decode, num_threads_working; int num_mis_waiting_for_decode; int min_threads_working = INT_MAX; @@ -3135,7 +3151,7 @@ static INLINE void signal_parse_sb_row_done(AV1Decoder *const pbi, static AOM_INLINE void parse_tile_row_mt(AV1Decoder *pbi, ThreadData *const td, TileDataDec *const tile_data) { AV1_COMMON *const cm = &pbi->common; - const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size]; + const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size]; const int num_planes = av1_num_planes(cm); TileInfo tile_info = tile_data->tile_info; int tile_row = tile_info.tile_row; @@ -3148,16 +3164,16 @@ static AOM_INLINE void parse_tile_row_mt(AV1Decoder *pbi, ThreadData *const td, av1_reset_loop_restoration(xd, num_planes); for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end; - mi_row += cm->seq_params.mib_size) { + mi_row += cm->seq_params->mib_size) { av1_zero_left_context(xd); for (int 
mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; - mi_col += cm->seq_params.mib_size) { + mi_col += cm->seq_params->mib_size) { set_cb_buffer(pbi, dcb, pbi->cb_buffer_base, num_planes, mi_row, mi_col); // Bit-stream parsing of the superblock decode_partition(pbi, td, mi_row, mi_col, td->bit_reader, - cm->seq_params.sb_size, 0x1); + cm->seq_params->sb_size, 0x1); if (aom_reader_has_overflowed(td->bit_reader)) { aom_merge_corrupted_flag(&dcb->corrupted, 1); @@ -3357,6 +3373,8 @@ void av1_free_mc_tmp_buf(ThreadData *thread_data) { aom_free(thread_data->tmp_conv_dst); thread_data->tmp_conv_dst = NULL; + aom_free(thread_data->seg_mask); + thread_data->seg_mask = NULL; for (int i = 0; i < 2; ++i) { aom_free(thread_data->tmp_obmc_bufs[i]); thread_data->tmp_obmc_bufs[i] = NULL; @@ -3389,6 +3407,10 @@ static AOM_INLINE void allocate_mc_tmp_buf(AV1_COMMON *const cm, CHECK_MEM_ERROR(cm, thread_data->tmp_conv_dst, aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(*thread_data->tmp_conv_dst))); + CHECK_MEM_ERROR(cm, thread_data->seg_mask, + (uint8_t *)aom_memalign( + 16, 2 * MAX_SB_SQUARE * sizeof(*thread_data->seg_mask))); + for (int i = 0; i < 2; ++i) { CHECK_MEM_ERROR( cm, thread_data->tmp_obmc_bufs[i], @@ -3411,6 +3433,8 @@ static AOM_INLINE void reset_dec_workers(AV1Decoder *pbi, thread_data->td->dcb.mc_buf[0] = thread_data->td->mc_buf[0]; thread_data->td->dcb.mc_buf[1] = thread_data->td->mc_buf[1]; thread_data->td->dcb.xd.tmp_conv_dst = thread_data->td->tmp_conv_dst; + if (worker_idx) + thread_data->td->dcb.xd.seg_mask = thread_data->td->seg_mask; for (int j = 0; j < 2; ++j) { thread_data->td->dcb.xd.tmp_obmc_bufs[j] = thread_data->td->tmp_obmc_bufs[j]; @@ -3481,7 +3505,7 @@ static AOM_INLINE void decode_mt_init(AV1Decoder *pbi) { winterface->init(worker); worker->thread_name = "aom tile worker"; if (worker_idx != 0 && !winterface->reset(worker)) { - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + aom_internal_error(&pbi->error, AOM_CODEC_ERROR, "Tile 
decoder thread creation failed"); } @@ -3498,7 +3522,7 @@ static AOM_INLINE void decode_mt_init(AV1Decoder *pbi) { thread_data->error_info.setjmp = 0; } } - const int use_highbd = cm->seq_params.use_highbitdepth; + const int use_highbd = cm->seq_params->use_highbitdepth; const int buf_size = MC_TEMP_BUF_PELS << use_highbd; for (worker_idx = 1; worker_idx < pbi->max_threads; ++worker_idx) { DecWorkerData *const thread_data = pbi->thread_data + worker_idx; @@ -3590,6 +3614,10 @@ static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data, if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) { decoder_alloc_tile_data(pbi, n_tiles); } + if (pbi->dcb.xd.seg_mask == NULL) + CHECK_MEM_ERROR(cm, pbi->dcb.xd.seg_mask, + (uint8_t *)aom_memalign( + 16, 2 * MAX_SB_SQUARE * sizeof(*pbi->dcb.xd.seg_mask))); for (int row = 0; row < tile_rows; row++) { for (int col = 0; col < tile_cols; col++) { @@ -3606,7 +3634,7 @@ static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data, sync_dec_workers(pbi, num_workers); if (pbi->dcb.corrupted) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Failed to decode tile data"); if (tiles->large_scale) { @@ -3624,8 +3652,8 @@ static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data, static AOM_INLINE void dec_alloc_cb_buf(AV1Decoder *pbi) { AV1_COMMON *const cm = &pbi->common; - int size = ((cm->mi_params.mi_rows >> cm->seq_params.mib_size_log2) + 1) * - ((cm->mi_params.mi_cols >> cm->seq_params.mib_size_log2) + 1); + int size = ((cm->mi_params.mi_rows >> cm->seq_params->mib_size_log2) + 1) * + ((cm->mi_params.mi_cols >> cm->seq_params->mib_size_log2) + 1); if (pbi->cb_buffer_alloc_size < size) { av1_dec_free_cb_buf(pbi); @@ -3669,10 +3697,10 @@ static AOM_INLINE void row_mt_frame_init(AV1Decoder *pbi, int tile_rows_start, tile_data->dec_row_mt_sync.num_threads_working = 0; tile_data->dec_row_mt_sync.mi_rows = 
ALIGN_POWER_OF_TWO(tile_info.mi_row_end - tile_info.mi_row_start, - cm->seq_params.mib_size_log2); + cm->seq_params->mib_size_log2); tile_data->dec_row_mt_sync.mi_cols = ALIGN_POWER_OF_TWO(tile_info.mi_col_end - tile_info.mi_col_start, - cm->seq_params.mib_size_log2); + cm->seq_params->mib_size_log2); frame_row_mt_info->mi_rows_to_decode += tile_data->dec_row_mt_sync.mi_rows; @@ -3776,6 +3804,10 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data, } decoder_alloc_tile_data(pbi, n_tiles); } + if (pbi->dcb.xd.seg_mask == NULL) + CHECK_MEM_ERROR(cm, pbi->dcb.xd.seg_mask, + (uint8_t *)aom_memalign( + 16, 2 * MAX_SB_SQUARE * sizeof(*pbi->dcb.xd.seg_mask))); for (int row = 0; row < tile_rows; row++) { for (int col = 0; col < tile_cols; col++) { @@ -3811,7 +3843,7 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data, sync_dec_workers(pbi, num_workers); if (pbi->dcb.corrupted) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Failed to decode tile data"); if (tiles->large_scale) { @@ -3829,7 +3861,7 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data, static AOM_INLINE void error_handler(void *data) { AV1_COMMON *const cm = (AV1_COMMON *)data; - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Truncated packet"); + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Truncated packet"); } // Reads the high_bitdepth and twelve_bit fields in color_config() and sets @@ -3860,7 +3892,7 @@ static AOM_INLINE void read_bitdepth( void av1_read_film_grain_params(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { aom_film_grain_t *pars = &cm->film_grain_params; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; pars->apply_grain = aom_rb_read_bit(rb); if (!pars->apply_grain) { @@ -3890,7 +3922,7 @@ void av1_read_film_grain_params(AV1_COMMON *cm, } } if 
(!found) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Invalid film grain reference idx %d. ref_frame_idx = " "{%d, %d, %d, %d, %d, %d, %d}", film_grain_params_ref_idx, cm->remapped_ref_idx[0], @@ -3900,11 +3932,11 @@ void av1_read_film_grain_params(AV1_COMMON *cm, } RefCntBuffer *const buf = cm->ref_frame_map[film_grain_params_ref_idx]; if (buf == NULL) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Invalid Film grain reference idx"); } if (!buf->film_grain_params_present) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Film grain reference parameters not available"); } uint16_t random_seed = pars->random_seed; @@ -3916,13 +3948,13 @@ void av1_read_film_grain_params(AV1_COMMON *cm, // Scaling functions parameters pars->num_y_points = aom_rb_read_literal(rb, 4); // max 14 if (pars->num_y_points > 14) - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Number of points for film grain luma scaling function " "exceeds the maximum value."); for (int i = 0; i < pars->num_y_points; i++) { pars->scaling_points_y[i][0] = aom_rb_read_literal(rb, 8); if (i && pars->scaling_points_y[i - 1][0] >= pars->scaling_points_y[i][0]) - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "First coordinate of the scaling function points " "shall be increasing."); pars->scaling_points_y[i][1] = aom_rb_read_literal(rb, 8); @@ -3941,14 +3973,14 @@ void av1_read_film_grain_params(AV1_COMMON *cm, } else { pars->num_cb_points = aom_rb_read_literal(rb, 4); // max 10 if (pars->num_cb_points > 10) - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Number of points for film 
grain cb scaling function " "exceeds the maximum value."); for (int i = 0; i < pars->num_cb_points; i++) { pars->scaling_points_cb[i][0] = aom_rb_read_literal(rb, 8); if (i && pars->scaling_points_cb[i - 1][0] >= pars->scaling_points_cb[i][0]) - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "First coordinate of the scaling function points " "shall be increasing."); pars->scaling_points_cb[i][1] = aom_rb_read_literal(rb, 8); @@ -3956,14 +3988,14 @@ void av1_read_film_grain_params(AV1_COMMON *cm, pars->num_cr_points = aom_rb_read_literal(rb, 4); // max 10 if (pars->num_cr_points > 10) - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Number of points for film grain cr scaling function " "exceeds the maximum value."); for (int i = 0; i < pars->num_cr_points; i++) { pars->scaling_points_cr[i][0] = aom_rb_read_literal(rb, 8); if (i && pars->scaling_points_cr[i - 1][0] >= pars->scaling_points_cr[i][0]) - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "First coordinate of the scaling function points " "shall be increasing."); pars->scaling_points_cr[i][1] = aom_rb_read_literal(rb, 8); @@ -3972,7 +4004,7 @@ void av1_read_film_grain_params(AV1_COMMON *cm, if ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) && (((pars->num_cb_points == 0) && (pars->num_cr_points != 0)) || ((pars->num_cb_points != 0) && (pars->num_cr_points == 0)))) - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "In YCbCr 4:2:0, film grain shall be applied " "to both chroma components or neither."); } @@ -4024,13 +4056,13 @@ void av1_read_film_grain_params(AV1_COMMON *cm, static AOM_INLINE void read_film_grain(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { - if (cm->seq_params.film_grain_params_present && + 
if (cm->seq_params->film_grain_params_present && (cm->show_frame || cm->showable_frame)) { av1_read_film_grain_params(cm, rb); } else { memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params)); } - cm->film_grain_params.bit_depth = cm->seq_params.bit_depth; + cm->film_grain_params.bit_depth = cm->seq_params->bit_depth; memcpy(&cm->cur_frame->film_grain_params, &cm->film_grain_params, sizeof(aom_film_grain_t)); } @@ -4164,7 +4196,7 @@ void av1_read_op_parameters_info(aom_dec_model_op_parameters_t *op_params, static AOM_INLINE void read_temporal_point_info( AV1_COMMON *const cm, struct aom_read_bit_buffer *rb) { cm->frame_presentation_time = aom_rb_read_unsigned_literal( - rb, cm->seq_params.decoder_model_info.frame_presentation_time_length); + rb, cm->seq_params->decoder_model_info.frame_presentation_time_length); } void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb, @@ -4192,7 +4224,7 @@ void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb, seq_params->frame_id_length = aom_rb_read_literal(rb, 3) + seq_params->delta_frame_id_length + 1; if (seq_params->frame_id_length > 16) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Invalid frame_id_length"); } @@ -4446,7 +4478,7 @@ static INLINE void reset_frame_buffers(AV1_COMMON *cm) { static int read_uncompressed_header(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) { AV1_COMMON *const cm = &pbi->common; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; CurrentFrame *const current_frame = &cm->current_frame; FeatureFlags *const features = &cm->features; MACROBLOCKD *const xd = &pbi->dcb.xd; @@ -4457,7 +4489,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, sframe_info->is_s_frame_at_altref = 0; if (!pbi->sequence_header_ready) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, 
AOM_CODEC_CORRUPT_FRAME, "No sequence header"); } @@ -4479,14 +4511,14 @@ static int read_uncompressed_header(AV1Decoder *pbi, if (cm->show_existing_frame) { if (pbi->sequence_header_changed) { aom_internal_error( - &cm->error, AOM_CODEC_CORRUPT_FRAME, + &pbi->error, AOM_CODEC_CORRUPT_FRAME, "New sequence header starts with a show_existing_frame."); } // Show an existing frame directly. const int existing_frame_idx = aom_rb_read_literal(rb, 3); RefCntBuffer *const frame_to_show = cm->ref_frame_map[existing_frame_idx]; if (frame_to_show == NULL) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM, "Buffer does not contain a decoded frame"); } if (seq_params->decoder_model_info_present_flag && @@ -4500,7 +4532,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, * referencing */ if (display_frame_id != cm->ref_frame_id[existing_frame_idx] || pbi->valid_for_referencing[existing_frame_idx] == 0) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Reference buffer frame ID mismatch"); } lock_buffer_pool(pool); @@ -4526,7 +4558,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, // show_existing_frame is used to show a previous frame, that the value // of showable_frame for the previous frame was equal to 1. 
if (!frame_to_show->showable_frame) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM, "Buffer does not contain a showable frame"); } // Section 6.8.2: It is a requirement of bitstream conformance that when @@ -4554,7 +4586,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, pbi->decoding_first_frame = 1; reset_frame_buffers(cm); } else { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Sequence header has changed without a keyframe."); } } @@ -4569,7 +4601,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, } if (seq_params->still_picture && (current_frame->frame_type != KEY_FRAME || !cm->show_frame)) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Still pictures must be coded as shown keyframes"); } cm->showable_frame = current_frame->frame_type != KEY_FRAME; @@ -4641,7 +4673,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, /* Check current_frame_id for conformance */ if (prev_frame_id == cm->current_frame_id || diff_frame_id >= (1 << (frame_id_length - 1))) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Invalid value of current_frame_id"); } } @@ -4672,18 +4704,18 @@ static int read_uncompressed_header(AV1Decoder *pbi, } if (seq_params->decoder_model_info_present_flag) { - cm->buffer_removal_time_present = aom_rb_read_bit(rb); - if (cm->buffer_removal_time_present) { + pbi->buffer_removal_time_present = aom_rb_read_bit(rb); + if (pbi->buffer_removal_time_present) { for (int op_num = 0; op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) { if (seq_params->op_params[op_num].decoder_model_param_present_flag) { - if ((((seq_params->operating_point_idc[op_num] >> + if (seq_params->operating_point_idc[op_num] == 0 || + 
(((seq_params->operating_point_idc[op_num] >> cm->temporal_layer_id) & 0x1) && ((seq_params->operating_point_idc[op_num] >> (cm->spatial_layer_id + 8)) & - 0x1)) || - seq_params->operating_point_idc[op_num] == 0) { + 0x1))) { cm->buffer_removal_times[op_num] = aom_rb_read_unsigned_literal( rb, seq_params->decoder_model_info.buffer_removal_time_length); } else { @@ -4713,7 +4745,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, if (current_frame->frame_type == INTRA_ONLY_FRAME) { current_frame->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES); if (current_frame->refresh_frame_flags == 0xFF) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM, "Intra only frames cannot have refresh flags 0xFF"); } if (pbi->need_resync) { @@ -4747,7 +4779,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, // pixels set to neutral grey. int buf_idx = get_free_fb(cm); if (buf_idx == INVALID_IDX) { - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Unable to find free frame buffer"); } buf = &frame_bufs[buf_idx]; @@ -4760,7 +4792,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, &buf->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv, 0)) { decrease_ref_count(buf, pool); unlock_buffer_pool(pool); - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } unlock_buffer_pool(pool); @@ -4827,10 +4859,10 @@ static int read_uncompressed_header(AV1Decoder *pbi, // reference to a slot that hasn't been set yet. That's what we are // checking here. 
if (lst_buf == NULL) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Inter frame requests nonexistent reference"); if (gld_buf == NULL) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Inter frame requests nonexistent reference"); av1_set_frame_refs(cm, cm->remapped_ref_idx, lst_ref, gld_ref); @@ -4848,7 +4880,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, // reference to a slot that hasn't been set yet. That's what we are // checking here. if (cm->ref_frame_map[ref] == NULL) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Inter frame requests nonexistent reference"); cm->remapped_ref_idx[i] = ref; } else { @@ -4856,7 +4888,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, } // Check valid for referencing if (pbi->valid_for_referencing[ref] == 0) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Reference frame not valid for referencing"); cm->ref_frame_sign_bias[LAST_FRAME + i] = 0; @@ -4872,7 +4904,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, // Compare values derived from delta_frame_id_minus_1 and // refresh_frame_flags. 
if (ref_frame_id != cm->ref_frame_id[ref]) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Reference buffer frame ID mismatch"); } } @@ -4895,7 +4927,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, cm->prev_frame = get_primary_ref_frame_buf(cm); if (features->primary_ref_frame != PRIMARY_REF_NONE && get_primary_ref_frame_buf(cm) == NULL) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Reference frame containing this frame's initial " "frame context is unavailable."); } @@ -4915,7 +4947,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, ref_scale_factors, ref_buf->buf.y_crop_width, ref_buf->buf.y_crop_height, cm->width, cm->height); if ((!av1_is_valid_scale(ref_scale_factors))) - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM, "Reference frame has invalid dimensions"); } } @@ -4952,7 +4984,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, cm->cur_frame->buf.render_height = cm->render_height; if (pbi->need_resync) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Keyframe / intra-only frame required to reset decoder" " state"); } @@ -4973,13 +5005,13 @@ static int read_uncompressed_header(AV1Decoder *pbi, read_tile_info(pbi, rb); if (!av1_is_min_tile_width_satisfied(cm)) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Minimum tile width requirement not satisfied"); } CommonQuantParams *const quant_params = &cm->quant_params; setup_quantization(quant_params, av1_num_planes(cm), - cm->seq_params.separate_uv_delta_q, rb); + cm->seq_params->separate_uv_delta_q, rb); xd->bd = (int)seq_params->bit_depth; CommonContexts *const above_contexts = &cm->above_contexts; @@ -4990,7 +5022,7 @@ static int 
read_uncompressed_header(AV1Decoder *pbi, if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows, cm->mi_params.mi_cols, av1_num_planes(cm))) { - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate context buffers"); } } @@ -5070,7 +5102,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, features->reduced_tx_set_used = aom_rb_read_bit(rb); if (features->allow_ref_frame_mvs && !frame_might_allow_ref_frame_mvs(cm)) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Frame wrongly requests reference frame MVs"); } @@ -5170,7 +5202,7 @@ uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi, // Use the default frame context values. *cm->fc = *cm->default_frame_context; if (!cm->fc->initialized) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Uninitialized entropy context."); } return uncomp_hdr_size; @@ -5180,8 +5212,8 @@ uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi, av1_setup_motion_field(cm); - av1_setup_block_planes(xd, cm->seq_params.subsampling_x, - cm->seq_params.subsampling_y, num_planes); + av1_setup_block_planes(xd, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y, num_planes); if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) { // use the default frame context values *cm->fc = *cm->default_frame_context; @@ -5189,7 +5221,7 @@ uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi, *cm->fc = get_primary_ref_frame_buf(cm)->frame_context; } if (!cm->fc->initialized) - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Uninitialized entropy context."); pbi->dcb.corrupted = 0; @@ -5207,7 +5239,7 @@ static AOM_INLINE void setup_frame_info(AV1Decoder *pbi) { av1_alloc_restoration_buffers(cm); } #endif - const int use_highbd = 
cm->seq_params.use_highbitdepth; + const int use_highbd = cm->seq_params->use_highbitdepth; const int buf_size = MC_TEMP_BUF_PELS << use_highbd; if (pbi->td.mc_buf_size != buf_size) { av1_free_mc_tmp_buf(&pbi->td); @@ -5242,13 +5274,17 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data, // If the bit stream is monochrome, set the U and V buffers to a constant. if (num_planes < 3) { - set_planes_to_neutral_grey(&cm->seq_params, xd->cur_buf, 1); + set_planes_to_neutral_grey(cm->seq_params, xd->cur_buf, 1); } if (end_tile != tiles->rows * tiles->cols - 1) { return; } + av1_alloc_cdef_buffers(cm, &pbi->cdef_worker, &pbi->cdef_sync, + pbi->num_workers); + av1_alloc_cdef_sync(cm, &pbi->cdef_sync, pbi->num_workers); + if (!cm->features.allow_intrabc && !tiles->single_tile_decoding) { if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) { if (pbi->num_workers > 1) { @@ -5257,13 +5293,13 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data, #if CONFIG_LPF_MASK 1, #endif - pbi->tile_workers, pbi->num_workers, &pbi->lf_row_sync); + pbi->tile_workers, pbi->num_workers, &pbi->lf_row_sync, 0); } else { av1_loop_filter_frame(&cm->cur_frame->buf, cm, &pbi->dcb.xd, #if CONFIG_LPF_MASK 1, #endif - 0, num_planes, 0); + 0, num_planes, 0, 0); } } @@ -5285,7 +5321,14 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data, cm, 0); if (do_cdef) { - av1_cdef_frame(&pbi->common.cur_frame->buf, cm, &pbi->dcb.xd); + if (pbi->num_workers > 1) { + av1_cdef_frame_mt(cm, &pbi->dcb.xd, pbi->cdef_worker, + pbi->tile_workers, &pbi->cdef_sync, + pbi->num_workers, av1_cdef_init_fb_row_mt); + } else { + av1_cdef_frame(&pbi->common.cur_frame->buf, cm, &pbi->dcb.xd, + av1_cdef_init_fb_row); + } } superres_post_decode(pbi); @@ -5323,7 +5366,14 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data, #else if (!optimized_loop_restoration) { if (do_cdef) { - av1_cdef_frame(&pbi->common.cur_frame->buf, cm, 
&pbi->dcb.xd); + if (pbi->num_workers > 1) { + av1_cdef_frame_mt(cm, &pbi->dcb.xd, pbi->cdef_worker, + pbi->tile_workers, &pbi->cdef_sync, + pbi->num_workers, av1_cdef_init_fb_row_mt); + } else { + av1_cdef_frame(&pbi->common.cur_frame->buf, cm, &pbi->dcb.xd, + av1_cdef_init_fb_row); + } } } #endif // !CONFIG_REALTIME_ONLY @@ -5339,7 +5389,7 @@ void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data, av1_reset_cdf_symbol_counters(cm->fc); } } else { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Decode failed. Frame data is corrupted."); } diff --git a/third_party/libaom/source/libaom/av1/decoder/decodemv.c b/third_party/libaom/source/libaom/av1/decoder/decodemv.c index 412be86989..839bda2be6 100644 --- a/third_party/libaom/source/libaom/av1/decoder/decodemv.c +++ b/third_party/libaom/source/libaom/av1/decoder/decodemv.c @@ -46,7 +46,7 @@ static void read_cdef(AV1_COMMON *cm, aom_reader *r, MACROBLOCKD *const xd) { // At the start of a superblock, mark that we haven't yet read CDEF strengths // for any of the CDEF units contained in this superblock. - const int sb_mask = (cm->seq_params.mib_size - 1); + const int sb_mask = (cm->seq_params->mib_size - 1); const int mi_row_in_sb = (xd->mi_row & sb_mask); const int mi_col_in_sb = (xd->mi_col & sb_mask); if (mi_row_in_sb == 0 && mi_col_in_sb == 0) { @@ -61,7 +61,7 @@ static void read_cdef(AV1_COMMON *cm, aom_reader *r, MACROBLOCKD *const xd) { const int index_mask = cdef_size; const int cdef_unit_row_in_sb = ((xd->mi_row & index_mask) != 0); const int cdef_unit_col_in_sb = ((xd->mi_col & index_mask) != 0); - const int index = (cm->seq_params.sb_size == BLOCK_128X128) + const int index = (cm->seq_params->sb_size == BLOCK_128X128) ? 
cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb : 0; @@ -85,12 +85,12 @@ static int read_delta_qindex(AV1_COMMON *cm, const MACROBLOCKD *xd, aom_reader *r, MB_MODE_INFO *const mbmi) { int sign, abs, reduced_delta_qindex = 0; BLOCK_SIZE bsize = mbmi->bsize; - const int b_col = xd->mi_col & (cm->seq_params.mib_size - 1); - const int b_row = xd->mi_row & (cm->seq_params.mib_size - 1); + const int b_col = xd->mi_col & (cm->seq_params->mib_size - 1); + const int b_row = xd->mi_row & (cm->seq_params->mib_size - 1); const int read_delta_q_flag = (b_col == 0 && b_row == 0); FRAME_CONTEXT *ec_ctx = xd->tile_ctx; - if ((bsize != cm->seq_params.sb_size || mbmi->skip_txfm == 0) && + if ((bsize != cm->seq_params->sb_size || mbmi->skip_txfm == 0) && read_delta_q_flag) { abs = aom_read_symbol(r, ec_ctx->delta_q_cdf, DELTA_Q_PROBS + 1, ACCT_STR); const int smallval = (abs < DELTA_Q_SMALL); @@ -117,11 +117,11 @@ static int read_delta_lflevel(const AV1_COMMON *const cm, aom_reader *r, int mi_row) { int reduced_delta_lflevel = 0; const BLOCK_SIZE bsize = mbmi->bsize; - const int b_col = mi_col & (cm->seq_params.mib_size - 1); - const int b_row = mi_row & (cm->seq_params.mib_size - 1); + const int b_col = mi_col & (cm->seq_params->mib_size - 1); + const int b_row = mi_row & (cm->seq_params->mib_size - 1); const int read_delta_lf_flag = (b_col == 0 && b_row == 0); - if ((bsize != cm->seq_params.sb_size || mbmi->skip_txfm == 0) && + if ((bsize != cm->seq_params->sb_size || mbmi->skip_txfm == 0) && read_delta_lf_flag) { int abs = aom_read_symbol(r, cdf, DELTA_LF_PROBS + 1, ACCT_STR); const int smallval = (abs < DELTA_LF_SMALL); @@ -579,7 +579,7 @@ static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_read_symbol(r, xd->tile_ctx->palette_y_size_cdf[bsize_ctx], PALETTE_SIZES, ACCT_STR) + 2; - read_palette_colors_y(xd, cm->seq_params.bit_depth, pmi, r); + read_palette_colors_y(xd, cm->seq_params->bit_depth, pmi, r); } } if (num_planes > 1 && mbmi->uv_mode == 
UV_DC_PRED && xd->is_chroma_ref) { @@ -591,7 +591,7 @@ static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_read_symbol(r, xd->tile_ctx->palette_uv_size_cdf[bsize_ctx], PALETTE_SIZES, ACCT_STR) + 2; - read_palette_colors_uv(xd, cm->seq_params.bit_depth, pmi, r); + read_palette_colors_uv(xd, cm->seq_params->bit_depth, pmi, r); } } } @@ -682,7 +682,7 @@ static INLINE int assign_dv(AV1_COMMON *cm, MACROBLOCKD *xd, int_mv *mv, mv->as_mv.row = (mv->as_mv.row >> 3) * 8; int valid = is_mv_valid(&mv->as_mv) && av1_is_dv_valid(mv->as_mv, cm, xd, mi_row, mi_col, bsize, - cm->seq_params.mib_size_log2); + cm->seq_params->mib_size_log2); return valid; } @@ -711,7 +711,7 @@ static void read_intrabc_info(AV1_COMMON *const cm, DecoderCodingBlock *dcb, av1_find_best_ref_mvs(0, ref_mvs[INTRA_FRAME], &nearestmv, &nearmv, 0); int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv; if (dv_ref.as_int == 0) - av1_find_ref_dv(&dv_ref, &xd->tile, cm->seq_params.mib_size, xd->mi_row); + av1_find_ref_dv(&dv_ref, &xd->tile, cm->seq_params->mib_size, xd->mi_row); // Ref DV should not have sub-pel. int valid_dv = (dv_ref.as_mv.col & 7) == 0 && (dv_ref.as_mv.row & 7) == 0; dv_ref.as_mv.col = (dv_ref.as_mv.col >> 3) * 8; @@ -816,7 +816,7 @@ static void read_intra_frame_mode_info(AV1_COMMON *const cm, ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED]) : 0; - if (!cm->seq_params.monochrome && xd->is_chroma_ref) { + if (!cm->seq_params->monochrome && xd->is_chroma_ref) { mbmi->uv_mode = read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode); if (mbmi->uv_mode == UV_CFL_PRED) { @@ -1076,7 +1076,7 @@ static void read_intra_block_mode_info(AV1_COMMON *const cm, use_angle_delta && av1_is_directional_mode(mbmi->mode) ? 
read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED]) : 0; - if (!cm->seq_params.monochrome && xd->is_chroma_ref) { + if (!cm->seq_params->monochrome && xd->is_chroma_ref) { mbmi->uv_mode = read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode); if (mbmi->uv_mode == UV_CFL_PRED) { @@ -1375,7 +1375,7 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi, aom_merge_corrupted_flag(&dcb->corrupted, mv_corrupted_flag); mbmi->use_wedge_interintra = 0; - if (cm->seq_params.enable_interintra_compound && !mbmi->skip_mode && + if (cm->seq_params->enable_interintra_compound && !mbmi->skip_mode && is_interintra_allowed(mbmi)) { const int bsize_group = size_group_lookup[bsize]; const int interintra = @@ -1423,7 +1423,7 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi, if (has_second_ref(mbmi) && !mbmi->skip_mode) { // Read idx to indicate current compound inter prediction mode group const int masked_compound_used = is_any_masked_compound_used(bsize) && - cm->seq_params.enable_masked_compound; + cm->seq_params->enable_masked_compound; if (masked_compound_used) { const int ctx_comp_group_idx = get_comp_group_idx_context(xd); @@ -1432,7 +1432,7 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi, } if (mbmi->comp_group_idx == 0) { - if (cm->seq_params.order_hint_info.enable_dist_wtd_comp) { + if (cm->seq_params->order_hint_info.enable_dist_wtd_comp) { const int comp_index_ctx = get_comp_index_context(cm, xd); mbmi->compound_idx = (uint8_t)aom_read_symbol( r, ec_ctx->compound_index_cdf[comp_index_ctx], 2, ACCT_STR); @@ -1473,7 +1473,7 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi, } read_mb_interp_filter(xd, features->interp_filter, - cm->seq_params.enable_dual_filter, mbmi, r); + cm->seq_params->enable_dual_filter, mbmi, r); #if !CONFIG_REALTIME_ONLY if (mbmi->motion_mode == WARPED_CAUSAL) { @@ -1573,11 +1573,11 @@ void av1_read_mode_info(AV1Decoder *const pbi, DecoderCodingBlock *dcb, if 
(frame_is_intra_only(cm)) { read_intra_frame_mode_info(cm, dcb, r); - if (cm->seq_params.order_hint_info.enable_ref_frame_mvs) + if (cm->seq_params->order_hint_info.enable_ref_frame_mvs) intra_copy_frame_mvs(cm, xd->mi_row, xd->mi_col, x_mis, y_mis); } else { read_inter_frame_mode_info(pbi, dcb, r); - if (cm->seq_params.order_hint_info.enable_ref_frame_mvs) + if (cm->seq_params->order_hint_info.enable_ref_frame_mvs) av1_copy_frame_mvs(cm, mi, xd->mi_row, xd->mi_col, x_mis, y_mis); } } diff --git a/third_party/libaom/source/libaom/av1/decoder/decoder.c b/third_party/libaom/source/libaom/av1/decoder/decoder.c index 1680734a09..40dd71cea2 100644 --- a/third_party/libaom/source/libaom/av1/decoder/decoder.c +++ b/third_party/libaom/source/libaom/av1/decoder/decoder.c @@ -97,17 +97,19 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) { av1_zero(*pbi); AV1_COMMON *volatile const cm = &pbi->common; + cm->seq_params = &pbi->seq_params; + cm->error = &pbi->error; // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. 
- if (setjmp(cm->error.jmp)) { - cm->error.setjmp = 0; + if (setjmp(pbi->error.jmp)) { + pbi->error.setjmp = 0; av1_decoder_remove(pbi); return NULL; } - cm->error.setjmp = 1; + pbi->error.setjmp = 1; CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc))); @@ -129,7 +131,7 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) { pbi->decoding_first_frame = 1; pbi->common.buffer_pool = pool; - cm->seq_params.bit_depth = AOM_BITS_8; + cm->seq_params->bit_depth = AOM_BITS_8; cm->mi_params.free_mi = dec_free_mi; cm->mi_params.setup_mi = dec_setup_mi; @@ -146,7 +148,7 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) { aom_accounting_init(&pbi->accounting); #endif - cm->error.setjmp = 0; + pbi->error.setjmp = 0; aom_get_worker_interface()->init(&pbi->lf_worker); pbi->lf_worker.thread_name = "aom lf worker"; @@ -194,6 +196,7 @@ void av1_decoder_remove(AV1Decoder *pbi) { } aom_free(pbi->thread_data); } + aom_free(pbi->dcb.xd.seg_mask); for (i = 0; i < pbi->num_workers; ++i) { AVxWorker *const worker = &pbi->tile_workers[i]; @@ -261,16 +264,16 @@ aom_codec_err_t av1_copy_reference_dec(AV1Decoder *pbi, int idx, const YV12_BUFFER_CONFIG *const cfg = get_ref_frame(cm, idx); if (cfg == NULL) { - aom_internal_error(&cm->error, AOM_CODEC_ERROR, "No reference frame"); + aom_internal_error(&pbi->error, AOM_CODEC_ERROR, "No reference frame"); return AOM_CODEC_ERROR; } if (!equal_dimensions(cfg, sd)) - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + aom_internal_error(&pbi->error, AOM_CODEC_ERROR, "Incorrect buffer dimensions"); else aom_yv12_copy_frame(cfg, sd, num_planes); - return cm->error.error_code; + return pbi->error.error_code; } static int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a, @@ -293,13 +296,13 @@ aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx, ref_buf = get_ref_frame(cm, idx); if (ref_buf == NULL) { - aom_internal_error(&cm->error, AOM_CODEC_ERROR, "No reference frame"); + 
aom_internal_error(cm->error, AOM_CODEC_ERROR, "No reference frame"); return AOM_CODEC_ERROR; } if (!use_external_ref) { if (!equal_dimensions(ref_buf, sd)) { - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Incorrect buffer dimensions"); } else { // Overwrite the reference frame buffer. @@ -307,7 +310,7 @@ aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx, } } else { if (!equal_dimensions_and_border(ref_buf, sd)) { - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Incorrect buffer dimensions"); } else { // Overwrite the reference frame buffer pointers. @@ -323,7 +326,7 @@ aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx, } } - return cm->error.error_code; + return cm->error->error_code; } aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm, @@ -332,12 +335,12 @@ aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm, const int num_planes = av1_num_planes(cm); if (!equal_dimensions_and_border(new_frame, sd)) - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Incorrect buffer dimensions"); else aom_yv12_copy_frame(new_frame, sd, num_planes); - return cm->error.error_code; + return cm->error->error_code; } static void release_current_frame(AV1Decoder *pbi) { @@ -355,7 +358,7 @@ static void release_current_frame(AV1Decoder *pbi) { // Consumes a reference to cm->cur_frame. // // This functions returns void. It reports failure by setting -// cm->error.error_code. +// pbi->error.error_code. 
static void update_frame_buffers(AV1Decoder *pbi, int frame_decoded) { int ref_index = 0, mask; AV1_COMMON *const cm = &pbi->common; @@ -388,7 +391,7 @@ static void update_frame_buffers(AV1Decoder *pbi, int frame_decoded) { // error cm->cur_frame->buf.corrupted = 1; decrease_ref_count(cm->cur_frame, pool); - cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; } else { pbi->output_frames[pbi->num_output_frames] = cm->cur_frame; pbi->num_output_frames++; @@ -427,8 +430,8 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, const uint8_t **psource) { AV1_COMMON *volatile const cm = &pbi->common; const uint8_t *source = *psource; - cm->error.error_code = AOM_CODEC_OK; - cm->error.has_detail = 0; + pbi->error.error_code = AOM_CODEC_OK; + pbi->error.has_detail = 0; if (size == 0) { // This is used to signal that we are missing frames. @@ -444,18 +447,18 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, } if (assign_cur_frame_new_fb(cm) == NULL) { - cm->error.error_code = AOM_CODEC_MEM_ERROR; + pbi->error.error_code = AOM_CODEC_MEM_ERROR; return 1; } // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. - if (setjmp(cm->error.jmp)) { + if (setjmp(pbi->error.jmp)) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); int i; - cm->error.setjmp = 0; + pbi->error.setjmp = 0; // Synchronize all threads immediately as a subsequent decode call may // cause a resize invalidating some allocations. 
@@ -469,15 +472,15 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, return -1; } - cm->error.setjmp = 1; + pbi->error.setjmp = 1; int frame_decoded = aom_decode_frame_from_obus(pbi, source, source + size, psource); if (frame_decoded < 0) { - assert(cm->error.error_code != AOM_CODEC_OK); + assert(pbi->error.error_code != AOM_CODEC_OK); release_current_frame(pbi); - cm->error.setjmp = 0; + pbi->error.setjmp = 0; return 1; } @@ -498,8 +501,8 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, pbi->decoding_first_frame = 0; } - if (cm->error.error_code != AOM_CODEC_OK) { - cm->error.setjmp = 0; + if (pbi->error.error_code != AOM_CODEC_OK) { + pbi->error.setjmp = 0; return 1; } @@ -518,7 +521,7 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, } // Update progress in frame parallel decode. - cm->error.setjmp = 0; + pbi->error.setjmp = 0; return 0; } diff --git a/third_party/libaom/source/libaom/av1/decoder/decoder.h b/third_party/libaom/source/libaom/av1/decoder/decoder.h index b20e9c1dda..226b9dca85 100644 --- a/third_party/libaom/source/libaom/av1/decoder/decoder.h +++ b/third_party/libaom/source/libaom/av1/decoder/decoder.h @@ -112,6 +112,8 @@ typedef struct ThreadData { // Motion compensation buffer used to get a prediction buffer with extended // borders. One buffer for each of the two possible references. uint8_t *mc_buf[2]; + // Mask for this block used for compound prediction. + uint8_t *seg_mask; // Allocated size of 'mc_buf'. int32_t mc_buf_size; // If true, the pointers in 'mc_buf' were converted from highbd pointers. @@ -227,6 +229,8 @@ typedef struct AV1Decoder { AV1LfSync lf_row_sync; AV1LrSync lr_row_sync; AV1LrStruct lr_ctxt; + AV1CdefSync cdef_sync; + AV1CdefWorkerData *cdef_worker; AVxWorker *tile_workers; int num_workers; DecWorkerData *thread_data; @@ -330,6 +334,32 @@ typedef struct AV1Decoder { int is_arf_frame_present; int num_tile_groups; aom_s_frame_info sframe_info; + + /*! 
+ * Elements part of the sequence header, that are applicable for all the + * frames in the video. + */ + SequenceHeader seq_params; + + /*! + * If true, buffer removal times are present. + */ + bool buffer_removal_time_present; + + /*! + * Code and details about current error status. + */ + struct aom_internal_error_info error; + + /*! + * Number of temporal layers: may be > 1 for SVC (scalable vector coding). + */ + unsigned int number_temporal_layers; + + /*! + * Number of spatial layers: may be > 1 for SVC (scalable vector coding). + */ + unsigned int number_spatial_layers; } AV1Decoder; // Returns 0 on success. Sets pbi->common.error.error_code to a nonzero error diff --git a/third_party/libaom/source/libaom/av1/decoder/obu.c b/third_party/libaom/source/libaom/av1/decoder/obu.c index d3d1f0e8be..6c80148cc9 100644 --- a/third_party/libaom/source/libaom/av1/decoder/obu.c +++ b/third_party/libaom/source/libaom/av1/decoder/obu.c @@ -69,7 +69,7 @@ static int byte_alignment(AV1_COMMON *const cm, struct aom_read_bit_buffer *const rb) { while (rb->bit_offset & 7) { if (aom_rb_read_bit(rb)) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + cm->error->error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } } @@ -110,12 +110,12 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, // Use a local variable to store the information as we decode. At the end, // if no errors have occurred, cm->seq_params is updated. 
- SequenceHeader sh = cm->seq_params; + SequenceHeader sh = *cm->seq_params; SequenceHeader *const seq_params = &sh; seq_params->profile = av1_read_profile(rb); if (seq_params->profile > CONFIG_MAX_DECODE_PROFILE) { - cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; return 0; } @@ -124,7 +124,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, seq_params->reduced_still_picture_hdr = aom_rb_read_bit(rb); // Video must have reduced_still_picture_hdr = 0 if (!seq_params->still_picture && seq_params->reduced_still_picture_hdr) { - cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; return 0; } @@ -135,7 +135,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, seq_params->operating_points_cnt_minus_1 = 0; seq_params->operating_point_idc[0] = 0; if (!read_bitstream_level(&seq_params->seq_level_idx[0], rb)) { - cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; return 0; } seq_params->tier[0] = 0; @@ -144,7 +144,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, } else { seq_params->timing_info_present = aom_rb_read_bit(rb); if (seq_params->timing_info_present) { - av1_read_timing_info_header(&seq_params->timing_info, &cm->error, rb); + av1_read_timing_info_header(&seq_params->timing_info, &pbi->error, rb); seq_params->decoder_model_info_present_flag = aom_rb_read_bit(rb); if (seq_params->decoder_model_info_present_flag) @@ -159,7 +159,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, seq_params->operating_point_idc[i] = aom_rb_read_literal(rb, OP_POINTS_IDC_BITS); if (!read_bitstream_level(&seq_params->seq_level_idx[i], rb)) { - cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; return 0; } // This is the seq_level_idx[i] > 7 check in the spec. 
seq_level_idx 7 @@ -188,7 +188,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass // the check if (seq_params->op_params[i].bitrate == 0) - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM, "AV1 does not support this combination of " "profile, level, and tier."); // Buffer size in bits/s is bitrate in bits/s * 1 s @@ -212,7 +212,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, aom_rb_read_literal(rb, 4) + 1; if (seq_params->op_params[i].initial_display_delay > 10) aom_internal_error( - &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + &pbi->error, AOM_CODEC_UNSUP_BITSTREAM, "AV1 does not support more than 10 decoded frames delay"); } else { seq_params->op_params[i].initial_display_delay = 10; @@ -232,19 +232,19 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, pbi->current_operating_point = seq_params->operating_point_idc[operating_point]; if (aom_get_num_layers_from_operating_point_idc( - pbi->current_operating_point, &cm->number_spatial_layers, - &cm->number_temporal_layers) != AOM_CODEC_OK) { - cm->error.error_code = AOM_CODEC_ERROR; + pbi->current_operating_point, &pbi->number_spatial_layers, + &pbi->number_temporal_layers) != AOM_CODEC_OK) { + pbi->error.error_code = AOM_CODEC_ERROR; return 0; } av1_read_sequence_header(cm, rb, seq_params); - av1_read_color_config(rb, pbi->allow_lowbitdepth, seq_params, &cm->error); + av1_read_color_config(rb, pbi->allow_lowbitdepth, seq_params, &pbi->error); if (!(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0) && !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) && !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 0)) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM, "Only 4:4:4, 4:2:2 and 4:2:0 are currently supported, " "%d %d 
subsampling is not supported.\n", seq_params->subsampling_x, seq_params->subsampling_y); @@ -253,18 +253,18 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, seq_params->film_grain_params_present = aom_rb_read_bit(rb); if (av1_check_trailing_bits(pbi, rb) != 0) { - // cm->error.error_code is already set. + // pbi->error.error_code is already set. return 0; } // If a sequence header has been decoded before, we check if the new // one is consistent with the old one. if (pbi->sequence_header_ready) { - if (!are_seq_headers_consistent(&cm->seq_params, seq_params)) + if (!are_seq_headers_consistent(cm->seq_params, seq_params)) pbi->sequence_header_changed = 1; } - cm->seq_params = *seq_params; + *cm->seq_params = *seq_params; pbi->sequence_header_ready = 1; return ((rb->bit_offset - saved_bit_offset + 7) >> 3); @@ -303,7 +303,7 @@ static int32_t read_tile_group_header(AV1Decoder *pbi, tile_start_and_end_present_flag = aom_rb_read_bit(rb); if (tile_start_implicit && tile_start_and_end_present_flag) { aom_internal_error( - &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + &pbi->error, AOM_CODEC_UNSUP_BITSTREAM, "For OBU_FRAME type obu tile_start_and_end_present_flag must be 0"); return -1; } @@ -318,20 +318,20 @@ static int32_t read_tile_group_header(AV1Decoder *pbi, *end_tile = aom_rb_read_literal(rb, tile_bits); } if (*start_tile != pbi->next_start_tile) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "tg_start (%d) must be equal to %d", *start_tile, pbi->next_start_tile); return -1; } if (*start_tile > *end_tile) { aom_internal_error( - &cm->error, AOM_CODEC_CORRUPT_FRAME, + &pbi->error, AOM_CODEC_CORRUPT_FRAME, "tg_end (%d) must be greater than or equal to tg_start (%d)", *end_tile, *start_tile); return -1; } if (*end_tile >= num_tiles) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "tg_end (%d) must be less than NumTiles 
(%d)", *end_tile, num_tiles); return -1; @@ -388,15 +388,16 @@ static void alloc_tile_list_buffer(AV1Decoder *pbi) { (pbi->output_frame_height_in_tiles_minus_1 + 1)); // Allocate the tile list output buffer. - // Note: if cm->seq_params.use_highbitdepth is 1 and cm->seq_params.bit_depth - // is 8, we could allocate less memory, namely, 8 bits/pixel. + // Note: if cm->seq_params->use_highbitdepth is 1 and + // cm->seq_params->bit_depth is 8, we could allocate less memory, namely, 8 + // bits/pixel. if (aom_alloc_frame_buffer(&pbi->tile_list_outbuf, output_frame_width, - output_frame_height, cm->seq_params.subsampling_x, - cm->seq_params.subsampling_y, - (cm->seq_params.use_highbitdepth && - (cm->seq_params.bit_depth > AOM_BITS_8)), + output_frame_height, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y, + (cm->seq_params->use_highbitdepth && + (cm->seq_params->bit_depth > AOM_BITS_8)), 0, cm->features.byte_alignment)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate the tile list output buffer"); } @@ -430,8 +431,8 @@ static void copy_decoded_tile_to_tile_list_buffer(AV1Decoder *pbi, av1_get_uniform_tile_size(cm, &tile_width, &tile_height); const int tile_width_in_pixels = tile_width * MI_SIZE; const int tile_height_in_pixels = tile_height * MI_SIZE; - const int ssy = cm->seq_params.subsampling_y; - const int ssx = cm->seq_params.subsampling_x; + const int ssy = cm->seq_params->subsampling_y; + const int ssx = cm->seq_params->subsampling_x; const int num_planes = av1_num_planes(cm); YV12_BUFFER_CONFIG *cur_frame = &cm->cur_frame->buf; @@ -455,8 +456,8 @@ static void copy_decoded_tile_to_tile_list_buffer(AV1Decoder *pbi, int vstart2 = tr * h; int hstart2 = tc * w; - if (cm->seq_params.use_highbitdepth && - cm->seq_params.bit_depth == AOM_BITS_8) { + if (cm->seq_params->use_highbitdepth && + cm->seq_params->bit_depth == AOM_BITS_8) { yv12_tile_copy(cur_frame, hstart1, hend1, 
vstart1, vend1, &pbi->tile_list_outbuf, hstart2, vstart2, plane); } else { @@ -501,7 +502,7 @@ static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi, pbi->output_frame_height_in_tiles_minus_1 = aom_rb_read_literal(rb, 8); pbi->tile_count_minus_1 = aom_rb_read_literal(rb, 16); if (pbi->tile_count_minus_1 > MAX_TILES - 1) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } @@ -524,7 +525,7 @@ static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi, // Set reference for each tile. int ref_idx = aom_rb_read_literal(rb, 8); if (ref_idx >= MAX_EXTERNAL_REFERENCES) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } av1_set_reference_dec(cm, cm->remapped_ref_idx[0], 1, @@ -535,14 +536,14 @@ static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi, if (pbi->dec_tile_row < 0 || pbi->dec_tile_col < 0 || pbi->dec_tile_row >= cm->tiles.rows || pbi->dec_tile_col >= cm->tiles.cols) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } pbi->coded_tile_data_size = aom_rb_read_literal(rb, 16) + 1; data += tile_info_bytes; if ((size_t)(data_end - data) < pbi->coded_tile_data_size) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } @@ -581,18 +582,17 @@ static void alloc_read_metadata(AV1Decoder *const pbi, OBU_METADATA_TYPE metadata_type, const uint8_t *data, size_t sz, aom_metadata_insert_flags_t insert_flag) { - AV1_COMMON *const cm = &pbi->common; if (!pbi->metadata) { pbi->metadata = aom_img_metadata_array_alloc(0); if (!pbi->metadata) { - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate metadata array"); } } aom_metadata_t *metadata = aom_img_metadata_alloc(metadata_type, data, sz, insert_flag); if (!metadata) { - 
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Error allocating metadata"); } aom_metadata_t **metadata_array = @@ -600,7 +600,7 @@ static void alloc_read_metadata(AV1Decoder *const pbi, (pbi->metadata->sz + 1) * sizeof(metadata)); if (!metadata_array) { aom_img_metadata_free(metadata); - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Error growing metadata array"); } pbi->metadata->metadata_array = metadata_array; @@ -611,22 +611,21 @@ static void alloc_read_metadata(AV1Decoder *const pbi, // On failure, calls aom_internal_error() and does not return. static void read_metadata_itut_t35(AV1Decoder *const pbi, const uint8_t *data, size_t sz) { - AV1_COMMON *const cm = &pbi->common; if (sz == 0) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "itu_t_t35_country_code is missing"); } int country_code_size = 1; if (*data == 0xFF) { if (sz == 1) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "itu_t_t35_country_code_extension_byte is missing"); } ++country_code_size; } int end_index = get_last_nonzero_byte_index(data, sz); if (end_index < country_code_size) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "No trailing bits found in ITU-T T.35 metadata OBU"); } // itu_t_t35_payload_bytes is byte aligned. Section 6.7.2 of the spec says: @@ -634,7 +633,7 @@ static void read_metadata_itut_t35(AV1Decoder *const pbi, const uint8_t *data, // specified in Recommendation ITU-T T.35. // Therefore the first trailing byte should be 0x80. 
if (data[end_index] != 0x80) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "The last nonzero byte of the ITU-T T.35 metadata OBU " "is 0x%02x, should be 0x80.", data[end_index]); @@ -648,9 +647,8 @@ static void read_metadata_itut_t35(AV1Decoder *const pbi, const uint8_t *data, static size_t read_metadata_hdr_cll(AV1Decoder *const pbi, const uint8_t *data, size_t sz) { const size_t kHdrCllPayloadSize = 4; - AV1_COMMON *const cm = &pbi->common; if (sz < kHdrCllPayloadSize) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Incorrect HDR CLL metadata payload size"); } alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_CLL, data, kHdrCllPayloadSize, @@ -663,9 +661,8 @@ static size_t read_metadata_hdr_cll(AV1Decoder *const pbi, const uint8_t *data, static size_t read_metadata_hdr_mdcv(AV1Decoder *const pbi, const uint8_t *data, size_t sz) { const size_t kMdcvPayloadSize = 24; - AV1_COMMON *const cm = &pbi->common; if (sz < kMdcvPayloadSize) { - aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, "Incorrect HDR MDCV metadata payload size"); } alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_MDCV, data, kMdcvPayloadSize, @@ -770,11 +767,10 @@ static uint8_t get_last_nonzero_byte(const uint8_t *data, size_t sz) { // pbi->common.error.error_code and returns 0, or calls aom_internal_error() // and does not return. 
static size_t read_metadata(AV1Decoder *pbi, const uint8_t *data, size_t sz) { - AV1_COMMON *const cm = &pbi->common; size_t type_length; uint64_t type_value; if (aom_uleb_decode(data, sz, &type_value, &type_length) < 0) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } const OBU_METADATA_TYPE metadata_type = (OBU_METADATA_TYPE)type_value; @@ -782,7 +778,7 @@ static size_t read_metadata(AV1Decoder *pbi, const uint8_t *data, size_t sz) { // If metadata_type is reserved for future use or a user private value, // ignore the entire OBU and just check trailing bits. if (get_last_nonzero_byte(data + type_length, sz - type_length) == 0) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } return sz; @@ -796,7 +792,7 @@ static size_t read_metadata(AV1Decoder *pbi, const uint8_t *data, size_t sz) { type_length + read_metadata_hdr_cll(pbi, data + type_length, sz - type_length); if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } return sz; @@ -805,7 +801,7 @@ static size_t read_metadata(AV1Decoder *pbi, const uint8_t *data, size_t sz) { type_length + read_metadata_hdr_mdcv(pbi, data + type_length, sz - type_length); if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } return sz; @@ -820,7 +816,7 @@ static size_t read_metadata(AV1Decoder *pbi, const uint8_t *data, size_t sz) { read_metadata_timecode(&rb); } if (av1_check_trailing_bits(pbi, &rb) != 0) { - // cm->error.error_code is already set. + // pbi->error.error_code is already set. 
return 0; } assert((rb.bit_offset & 7) == 0); @@ -838,7 +834,7 @@ static size_t read_padding(AV1_COMMON *const cm, const uint8_t *data, // trailing byte should be 0x80. See https://crbug.com/aomedia/2393. const uint8_t last_nonzero_byte = get_last_nonzero_byte(data, sz); if (last_nonzero_byte != 0x80) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + cm->error->error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } } @@ -846,7 +842,7 @@ static size_t read_padding(AV1_COMMON *const cm, const uint8_t *data, } // On success, returns a boolean that indicates whether the decoding of the -// current frame is finished. On failure, sets cm->error.error_code and +// current frame is finished. On failure, sets pbi->error.error_code and // returns -1. int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end, @@ -872,7 +868,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, pbi->num_tile_groups = 0; if (data_end < data) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } @@ -880,7 +876,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, if (!cm->tiles.large_scale) pbi->camera_frame_header_ready = 0; // decode frame as a series of OBUs - while (!frame_decoding_finished && cm->error.error_code == AOM_CODEC_OK) { + while (!frame_decoding_finished && pbi->error.error_code == AOM_CODEC_OK) { struct aom_read_bit_buffer rb; size_t payload_size = 0; size_t decoded_payload_size = 0; @@ -890,7 +886,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, if (bytes_available == 0 && !pbi->seen_frame_header) { *p_data_end = data; - cm->error.error_code = AOM_CODEC_OK; + pbi->error.error_code = AOM_CODEC_OK; break; } @@ -899,7 +895,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, &obu_header, &payload_size, &bytes_read); if (status != AOM_CODEC_OK) { - cm->error.error_code 
= status; + pbi->error.error_code = status; return -1; } @@ -912,7 +908,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, data += bytes_read; if ((size_t)(data_end - data) < payload_size) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } @@ -936,16 +932,16 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, if (pbi->seen_frame_header) { // A new temporal unit has started, but the frame in the previous // temporal unit is incomplete. - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } break; case OBU_SEQUENCE_HEADER: decoded_payload_size = read_sequence_header_obu(pbi, &rb); - if (cm->error.error_code != AOM_CODEC_OK) return -1; + if (pbi->error.error_code != AOM_CODEC_OK) return -1; // The sequence header should not change in the middle of a frame. if (pbi->sequence_header_changed && pbi->seen_frame_header) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } break; @@ -954,13 +950,13 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, case OBU_FRAME: if (obu_header.type == OBU_REDUNDANT_FRAME_HEADER) { if (!pbi->seen_frame_header) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } } else { // OBU_FRAME_HEADER or OBU_FRAME. if (pbi->seen_frame_header) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } } @@ -978,7 +974,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, // frame_header_obu. 
if (frame_header_size > payload_size || memcmp(data, frame_header, frame_header_size) != 0) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } assert(rb.bit_offset == 0); @@ -990,7 +986,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, if (cm->show_existing_frame) { if (obu_header.type == OBU_FRAME) { - cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; return -1; } frame_decoding_finished = 1; @@ -1012,23 +1008,23 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, if (obu_header.type != OBU_FRAME) break; obu_payload_offset = frame_header_size; // Byte align the reader before reading the tile group. - // byte_alignment() has set cm->error.error_code if it returns -1. + // byte_alignment() has set pbi->error.error_code if it returns -1. if (byte_alignment(cm, &rb)) return -1; AOM_FALLTHROUGH_INTENDED; // fall through to read tile group. 
case OBU_TILE_GROUP: if (!pbi->seen_frame_header) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } if (obu_payload_offset > payload_size) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } decoded_payload_size += read_one_tile_group_obu( pbi, &rb, is_first_tg_obu_received, data + obu_payload_offset, data + payload_size, p_data_end, &frame_decoding_finished, obu_header.type == OBU_FRAME); - if (cm->error.error_code != AOM_CODEC_OK) return -1; + if (pbi->error.error_code != AOM_CODEC_OK) return -1; is_first_tg_obu_received = 0; if (frame_decoding_finished) { pbi->seen_frame_header = 0; @@ -1038,18 +1034,18 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, break; case OBU_METADATA: decoded_payload_size = read_metadata(pbi, data, payload_size); - if (cm->error.error_code != AOM_CODEC_OK) return -1; + if (pbi->error.error_code != AOM_CODEC_OK) return -1; break; case OBU_TILE_LIST: if (CONFIG_NORMAL_TILE_MODE) { - cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; return -1; } // This OBU type is purely for the large scale tile coding mode. // The common camera frame header has to be already decoded. 
if (!pbi->camera_frame_header_ready) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } @@ -1058,17 +1054,17 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, decoded_payload_size = read_and_decode_one_tile_list(pbi, &rb, data, data + payload_size, p_data_end, &frame_decoding_finished); - if (cm->error.error_code != AOM_CODEC_OK) return -1; + if (pbi->error.error_code != AOM_CODEC_OK) return -1; break; case OBU_PADDING: decoded_payload_size = read_padding(cm, data, payload_size); - if (cm->error.error_code != AOM_CODEC_OK) return -1; + if (pbi->error.error_code != AOM_CODEC_OK) return -1; break; default: // Skip unrecognized OBUs if (payload_size > 0 && get_last_nonzero_byte(data, payload_size) == 0) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } decoded_payload_size = payload_size; @@ -1077,7 +1073,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, // Check that the signalled OBU size matches the actual amount of data read if (decoded_payload_size > payload_size) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } @@ -1085,7 +1081,7 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, while (decoded_payload_size < payload_size) { uint8_t padding_byte = data[decoded_payload_size++]; if (padding_byte != 0) { - cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return -1; } } @@ -1093,6 +1089,6 @@ int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, data += payload_size; } - if (cm->error.error_code != AOM_CODEC_OK) return -1; + if (pbi->error.error_code != AOM_CODEC_OK) return -1; return frame_decoding_finished; } diff --git a/third_party/libaom/source/libaom/av1/encoder/aq_complexity.c 
b/third_party/libaom/source/libaom/av1/encoder/aq_complexity.c index 3ea5f63020..278e1ca92f 100644 --- a/third_party/libaom/source/libaom/av1/encoder/aq_complexity.c +++ b/third_party/libaom/source/libaom/av1/encoder/aq_complexity.c @@ -81,7 +81,7 @@ void av1_setup_in_frame_q_adj(AV1_COMP *cpi) { if (is_frame_aq_enabled(cpi)) { int segment; const int aq_strength = - get_aq_c_strength(base_qindex, cm->seq_params.bit_depth); + get_aq_c_strength(base_qindex, cm->seq_params->bit_depth); // Clear down the segment map. memset(cpi->enc_seg.map, DEFAULT_AQ2_SEG, @@ -108,7 +108,7 @@ void av1_setup_in_frame_q_adj(AV1_COMP *cpi) { qindex_delta = av1_compute_qdelta_by_rate( &cpi->rc, cm->current_frame.frame_type, base_qindex, aq_c_q_adj_factor[aq_strength][segment], cpi->is_screen_content_type, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); // For AQ complexity mode, we dont allow Q0 in a segment if the base // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment @@ -150,17 +150,17 @@ void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs, // It is converted to bits << AV1_PROB_COST_SHIFT units. const int64_t num = (int64_t)(cpi->rc.sb64_target_rate * xmis * ymis) << AV1_PROB_COST_SHIFT; - const int denom = cm->seq_params.mib_size * cm->seq_params.mib_size; + const int denom = cm->seq_params->mib_size * cm->seq_params->mib_size; const int target_rate = (int)(num / denom); double logvar; double low_var_thresh; const int aq_strength = get_aq_c_strength(cm->quant_params.base_qindex, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); aom_clear_system_state(); low_var_thresh = (is_stat_consumption_stage_twopass(cpi)) - ? AOMMAX(exp(cpi->twopass.mb_av_energy), MIN_DEFAULT_LV_THRESH) + ? 
AOMMAX(exp(cpi->ppi->twopass.mb_av_energy), MIN_DEFAULT_LV_THRESH) : DEFAULT_LV_THRESH; av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes, bs); diff --git a/third_party/libaom/source/libaom/av1/encoder/aq_cyclicrefresh.c b/third_party/libaom/source/libaom/av1/encoder/aq_cyclicrefresh.c index c7abe43c87..40b8c254d4 100644 --- a/third_party/libaom/source/libaom/av1/encoder/aq_cyclicrefresh.c +++ b/third_party/libaom/source/libaom/av1/encoder/aq_cyclicrefresh.c @@ -12,6 +12,7 @@ #include <limits.h> #include <math.h> +#include "av1/common/pred_common.h" #include "av1/common/seg_common.h" #include "av1/encoder/aq_cyclicrefresh.h" #include "av1/encoder/ratectrl.h" @@ -82,7 +83,7 @@ static int compute_deltaq(const AV1_COMP *cpi, int q, double rate_factor) { const RATE_CONTROL *const rc = &cpi->rc; int deltaq = av1_compute_qdelta_by_rate( rc, cpi->common.current_frame.frame_type, q, rate_factor, - cpi->is_screen_content_type, cpi->common.seq_params.bit_depth); + cpi->is_screen_content_type, cpi->common.seq_params->bit_depth); if ((-deltaq) > cr->max_qdelta_perc * q / 100) { deltaq = -cr->max_qdelta_perc * q / 100; } @@ -94,7 +95,7 @@ int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi, const AV1_COMMON *const cm = &cpi->common; const FRAME_TYPE frame_type = cm->current_frame.frame_type; const int base_qindex = cm->quant_params.base_qindex; - const int bit_depth = cm->seq_params.bit_depth; + const int bit_depth = cm->seq_params->bit_depth; const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; const int mbs = cm->mi_params.MBs; const int num4x4bl = mbs << 4; @@ -138,15 +139,51 @@ int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i, bits_per_mb = (int)((1.0 - weight_segment) * av1_rc_bits_per_mb(cm->current_frame.frame_type, i, - correction_factor, cm->seq_params.bit_depth, + correction_factor, cm->seq_params->bit_depth, cpi->is_screen_content_type) + weight_segment * av1_rc_bits_per_mb(cm->current_frame.frame_type, i + deltaq, 
correction_factor, - cm->seq_params.bit_depth, + cm->seq_params->bit_depth, cpi->is_screen_content_type)); return bits_per_mb; } +void av1_cyclic_reset_segment_skip(const AV1_COMP *cpi, MACROBLOCK *const x, + int mi_row, int mi_col, BLOCK_SIZE bsize) { + int cdf_num; + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int prev_segment_id = mbmi->segment_id; + mbmi->segment_id = av1_get_spatial_seg_pred(cm, xd, &cdf_num); + if (prev_segment_id != mbmi->segment_id) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, bw); + const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh); + const int block_index = mi_row * cm->mi_params.mi_cols + mi_col; + for (int mi_y = 0; mi_y < ymis; mi_y++) { + for (int mi_x = 0; mi_x < xmis; mi_x++) { + const int map_offset = + block_index + mi_y * cm->mi_params.mi_cols + mi_x; + cr->map[map_offset] = 0; + cpi->enc_seg.map[map_offset] = mbmi->segment_id; + cm->cur_frame->seg_map[map_offset] = mbmi->segment_id; + } + } + if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST1) + x->actual_num_seg1_blocks -= xmis * ymis; + else if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST2) + x->actual_num_seg2_blocks -= xmis * ymis; + if (cyclic_refresh_segment_id(mbmi->segment_id) == CR_SEGMENT_ID_BOOST1) + x->actual_num_seg1_blocks += xmis * ymis; + else if (cyclic_refresh_segment_id(mbmi->segment_id) == + CR_SEGMENT_ID_BOOST2) + x->actual_num_seg2_blocks += xmis * ymis; + } +} + void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi, MACROBLOCK *const x, int mi_row, int mi_col, BLOCK_SIZE bsize, int64_t rate, int64_t dist, int skip, @@ -191,22 +228,21 @@ void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi, MACROBLOCK *const x, // Update entries in the cyclic refresh map with new_map_value, 
and // copy mbmi->segment_id into global segmentation map. - // 8x8 is smallest coding block size for non-key frames. - const int sh = bw << 1; - for (int mi_y = 0; mi_y < ymis; mi_y += 2) { - for (int mi_x = 0; mi_x < xmis; mi_x += 2) { - int map_offset = block_index + mi_y * cm->mi_params.mi_cols + mi_x; + for (int mi_y = 0; mi_y < ymis; mi_y++) { + for (int mi_x = 0; mi_x < xmis; mi_x++) { + const int map_offset = block_index + mi_y * cm->mi_params.mi_cols + mi_x; cr->map[map_offset] = new_map_value; cpi->enc_seg.map[map_offset] = mbmi->segment_id; + cm->cur_frame->seg_map[map_offset] = mbmi->segment_id; } - // Accumulate cyclic refresh update counters. - if (!dry_run && !frame_is_intra_only(cm)) { - if (cyclic_refresh_segment_id(mbmi->segment_id) == CR_SEGMENT_ID_BOOST1) - x->actual_num_seg1_blocks += sh; - else if (cyclic_refresh_segment_id(mbmi->segment_id) == - CR_SEGMENT_ID_BOOST2) - x->actual_num_seg2_blocks += sh; - } + } + // Accumulate cyclic refresh update counters. + if (!dry_run) { + if (cyclic_refresh_segment_id(mbmi->segment_id) == CR_SEGMENT_ID_BOOST1) + x->actual_num_seg1_blocks += xmis * ymis; + else if (cyclic_refresh_segment_id(mbmi->segment_id) == + CR_SEGMENT_ID_BOOST2) + x->actual_num_seg2_blocks += xmis * ymis; } } @@ -234,15 +270,15 @@ void av1_cyclic_refresh_postencode(AV1_COMP *const cpi) { const int avg_cnt_zeromv = 100 * cr->cnt_zeromv / (mi_params->mi_rows * mi_params->mi_cols); - if (!cpi->use_svc || - (cpi->use_svc && + if (!cpi->ppi->use_svc || + (cpi->ppi->use_svc && !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { rc->avg_frame_low_motion = (3 * rc->avg_frame_low_motion + avg_cnt_zeromv) / 4; // For SVC: set avg_frame_low_motion (only computed on top spatial layer) // to all lower spatial layers. 
- if (cpi->use_svc && + if (cpi->ppi->use_svc && svc->spatial_layer_id == svc->number_spatial_layers - 1) { for (int i = 0; i < svc->number_spatial_layers - 1; ++i) { const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, @@ -257,15 +293,16 @@ void av1_cyclic_refresh_postencode(AV1_COMP *const cpi) { void av1_cyclic_refresh_set_golden_update(AV1_COMP *const cpi) { RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; // Set minimum gf_interval for GF update to a multiple of the refresh period, // with some max limit. Depending on past encoding stats, GF flag may be // reset and update may not occur until next baseline_gf_interval. if (cr->percent_refresh > 0) - rc->baseline_gf_interval = AOMMIN(2 * (100 / cr->percent_refresh), 40); + p_rc->baseline_gf_interval = AOMMIN(2 * (100 / cr->percent_refresh), 40); else - rc->baseline_gf_interval = 20; - if (rc->avg_frame_low_motion < 40) rc->baseline_gf_interval = 8; + p_rc->baseline_gf_interval = 20; + if (rc->avg_frame_low_motion < 40) p_rc->baseline_gf_interval = 8; } // Update the segmentation map, and related quantities: cyclic refresh map, @@ -282,10 +319,10 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) { int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame; int xmis, ymis, x, y; memset(seg_map, CR_SEGMENT_ID_BASE, mi_params->mi_rows * mi_params->mi_cols); - sb_cols = (mi_params->mi_cols + cm->seq_params.mib_size - 1) / - cm->seq_params.mib_size; - sb_rows = (mi_params->mi_rows + cm->seq_params.mib_size - 1) / - cm->seq_params.mib_size; + sb_cols = (mi_params->mi_cols + cm->seq_params->mib_size - 1) / + cm->seq_params->mib_size; + sb_rows = (mi_params->mi_rows + cm->seq_params->mib_size - 1) / + cm->seq_params->mib_size; sbs_in_frame = sb_cols * sb_rows; // Number of target blocks to get the q delta (segment 1). 
block_count = @@ -302,8 +339,8 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) { // Get the mi_row/mi_col corresponding to superblock index i. int sb_row_index = (i / sb_cols); int sb_col_index = i - sb_row_index * sb_cols; - int mi_row = sb_row_index * cm->seq_params.mib_size; - int mi_col = sb_col_index * cm->seq_params.mib_size; + int mi_row = sb_row_index * cm->seq_params->mib_size; + int mi_col = sb_col_index * cm->seq_params->mib_size; // TODO(any): Ensure the population of // cpi->common.features.allow_screen_content_tools and use the same instead // of cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN @@ -315,8 +352,8 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) { assert(mi_col >= 0 && mi_col < mi_params->mi_cols); bl_index = mi_row * mi_params->mi_cols + mi_col; // Loop through all MI blocks in superblock and update map. - xmis = AOMMIN(mi_params->mi_cols - mi_col, cm->seq_params.mib_size); - ymis = AOMMIN(mi_params->mi_rows - mi_row, cm->seq_params.mib_size); + xmis = AOMMIN(mi_params->mi_cols - mi_col, cm->seq_params->mib_size); + ymis = AOMMIN(mi_params->mi_rows - mi_row, cm->seq_params->mib_size); // cr_map only needed at 8x8 blocks. for (y = 0; y < ymis; y += 2) { for (x = 0; x < xmis; x += 2) { @@ -361,11 +398,20 @@ void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) { int qp_thresh = AOMMIN(20, rc->best_quality << 1); int qp_max_thresh = 118 * MAXQ >> 7; cr->apply_cyclic_refresh = 1; + int avg_frame_qindex_inter_frame; +#if CONFIG_FRAME_PARALLEL_ENCODE + avg_frame_qindex_inter_frame = + (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) + ? 
cpi->ppi->temp_avg_frame_qindex[INTER_FRAME] + : rc->avg_frame_qindex[INTER_FRAME]; +#else + avg_frame_qindex_inter_frame = rc->avg_frame_qindex[INTER_FRAME]; +#endif // CONFIG_FRAME_PARALLEL_ENCODE if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf.rc_cfg) || cpi->svc.temporal_layer_id > 0 || - rc->avg_frame_qindex[INTER_FRAME] < qp_thresh || + avg_frame_qindex_inter_frame < qp_thresh || (rc->frames_since_key > 20 && - rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh) || + avg_frame_qindex_inter_frame > qp_max_thresh) || (rc->avg_frame_low_motion < 45 && rc->frames_since_key > 40)) { cr->apply_cyclic_refresh = 0; return; @@ -446,7 +492,7 @@ void av1_cyclic_refresh_setup(AV1_COMP *const cpi) { return; } else { const double q = av1_convert_qindex_to_q(cm->quant_params.base_qindex, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); aom_clear_system_state(); // Set rate threshold to some multiple (set to 2 for now) of the target // rate (target is given by sb64_target_rate and scaled by 256). diff --git a/third_party/libaom/source/libaom/av1/encoder/aq_cyclicrefresh.h b/third_party/libaom/source/libaom/av1/encoder/aq_cyclicrefresh.h index 97bd6f26b1..1c0d5cb4d7 100644 --- a/third_party/libaom/source/libaom/av1/encoder/aq_cyclicrefresh.h +++ b/third_party/libaom/source/libaom/av1/encoder/aq_cyclicrefresh.h @@ -161,6 +161,30 @@ int av1_cyclic_refresh_estimate_bits_at_q(const struct AV1_COMP *cpi, int av1_cyclic_refresh_rc_bits_per_mb(const struct AV1_COMP *cpi, int i, double correction_factor); +/*!\brief Update segment_id for blocks are skipped. + * + * After encoding a given prediction block, of size bsize at (mi_row, mi_col), + * check if we should reset the segment_id based on skip_txfm, + * and update the cyclic_refresh map and segmentation counters. 
+ * + * \ingroup cyclic_refresh + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * \param[in] x Pointer to MACROBLOCK structure + * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE + * \param[in] mi_col Col coordinate of the block in a step size of MI_SIZE + * \param[in] bsize Block size + * + * \return Update the \c mbmi->segment_id, the \c cpi->cyclic_refresh and + * the \c cm->cpi->enc_seg.map. + */ + +void av1_cyclic_reset_segment_skip(const struct AV1_COMP *cpi, + MACROBLOCK *const x, int mi_row, int mi_col, + BLOCK_SIZE bsize); + /*!\brief Update segment_id for block based on mode selected. * * Prior to coding a given prediction block, of size bsize at (mi_row, mi_col), diff --git a/third_party/libaom/source/libaom/av1/encoder/aq_variance.c b/third_party/libaom/source/libaom/av1/encoder/aq_variance.c index 92d7ad172d..79bf9f8419 100644 --- a/third_party/libaom/source/libaom/av1/encoder/aq_variance.c +++ b/third_party/libaom/source/libaom/av1/encoder/aq_variance.c @@ -52,7 +52,7 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) { int resolution_change = cm->prev_frame && (cm->width != cm->prev_frame->width || cm->height != cm->prev_frame->height); - int avg_energy = (int)(cpi->twopass.mb_av_energy - 2); + int avg_energy = (int)(cpi->ppi->twopass.mb_av_energy - 2); double avg_ratio; if (avg_energy > 7) avg_energy = 7; if (avg_energy < 0) avg_energy = 0; @@ -81,7 +81,7 @@ void av1_vaq_frame_setup(AV1_COMP *cpi) { int qindex_delta = av1_compute_qdelta_by_rate( &cpi->rc, cm->current_frame.frame_type, base_qindex, rate_ratio[i] / avg_ratio, cpi->is_screen_content_type, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); // We don't allow qindex 0 in a segment if the base value is not 0. 
// Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment @@ -126,14 +126,14 @@ int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { for (j = 0; j < bw; j += 4) { if (is_cur_buf_hbd(xd)) { var += - log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf( + log(1.0 + cpi->ppi->fn_ptr[BLOCK_4X4].vf( x->plane[0].src.buf + i * x->plane[0].src.stride + j, x->plane[0].src.stride, CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse) / 16); } else { var += - log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf( + log(1.0 + cpi->ppi->fn_ptr[BLOCK_4X4].vf( x->plane[0].src.buf + i * x->plane[0].src.stride + j, x->plane[0].src.stride, av1_all_zeros, 0, &sse) / 16); @@ -154,15 +154,12 @@ static unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) { MACROBLOCKD *xd = &x->e_mbd; int stride = x->plane[0].src.stride; uint8_t *buf = x->plane[0].src.buf; - const int bw = MI_SIZE * mi_size_wide[bs]; - const int bh = MI_SIZE * mi_size_high[bs]; + const int num_8x8_cols = block_size_wide[bs] / 8; + const int num_8x8_rows = block_size_high[bs] / 8; const int hbd = is_cur_buf_hbd(xd); - int var = 0; - for (int r = 0; r < bh; r += 8) - for (int c = 0; c < bw; c += 8) { - var += av1_haar_ac_sad_8x8_uint8_input(buf + c + r * stride, stride, hbd); - } + int64_t var = av1_haar_ac_sad_mxn_uint8_input(buf, stride, hbd, num_8x8_rows, + num_8x8_cols); return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs]; } @@ -178,7 +175,7 @@ int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x, double energy, energy_midpoint; aom_clear_system_state(); energy_midpoint = (is_stat_consumption_stage_twopass(cpi)) - ? cpi->twopass.frame_avg_haar_energy + ? 
cpi->ppi->twopass.frame_avg_haar_energy : DEFAULT_E_MIDPOINT; energy = av1_log_block_wavelet_energy(x, bs) - energy_midpoint; return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX); @@ -199,7 +196,7 @@ int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi, int qindex_delta = av1_compute_qdelta_by_rate( &cpi->rc, cm->current_frame.frame_type, base_qindex, deltaq_rate_ratio[rate_level], cpi->is_screen_content_type, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) { qindex_delta = -base_qindex + 1; diff --git a/third_party/libaom/source/libaom/av1/encoder/av1_noise_estimate.c b/third_party/libaom/source/libaom/av1/encoder/av1_noise_estimate.c index dbc86c5034..8b2fc38923 100644 --- a/third_party/libaom/source/libaom/av1/encoder/av1_noise_estimate.c +++ b/third_party/libaom/source/libaom/av1/encoder/av1_noise_estimate.c @@ -27,8 +27,8 @@ #if CONFIG_AV1_TEMPORAL_DENOISING // For SVC: only do noise estimation on top spatial layer. static INLINE int noise_est_svc(const struct AV1_COMP *const cpi) { - return (!cpi->use_svc || - (cpi->use_svc && + return (!cpi->ppi->use_svc || + (cpi->ppi->use_svc && cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)); } #endif @@ -61,7 +61,7 @@ static int enable_noise_estimation(AV1_COMP *const cpi) { cpi->common.height != resize_pending_params->height)); #if CONFIG_AV1_HIGHBITDEPTH - if (cpi->common.seq_params.use_highbitdepth) return 0; + if (cpi->common.seq_params->use_highbitdepth) return 0; #endif // Enable noise estimation if denoising is on. #if CONFIG_AV1_TEMPORAL_DENOISING @@ -75,7 +75,7 @@ static int enable_noise_estimation(AV1_COMP *const cpi) { // Not enabled for low resolutions. 
if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_cfg.mode == AOM_CBR && cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.speed >= 5 && - resize_pending == 0 && !cpi->use_svc && + resize_pending == 0 && !cpi->ppi->use_svc && cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN && cpi->common.width * cpi->common.height >= 640 * 360) return 1; @@ -227,7 +227,7 @@ void av1_update_noise_estimate(AV1_COMP *const cpi) { unsigned int sse; // Compute variance between co-located blocks from current and // last input frames. - unsigned int variance = cpi->fn_ptr[bsize].vf( + unsigned int variance = cpi->ppi->fn_ptr[bsize].vf( src_y, src_ystride, last_src_y, last_src_ystride, &sse); unsigned int hist_index = variance / bin_size; if (hist_index < MAX_VAR_HIST_BINS) diff --git a/third_party/libaom/source/libaom/av1/encoder/av1_quantize.c b/third_party/libaom/source/libaom/av1/encoder/av1_quantize.c index 9d38e2d77d..2b07e4c71b 100644 --- a/third_party/libaom/source/libaom/av1/encoder/av1_quantize.c +++ b/third_party/libaom/source/libaom/av1/encoder/av1_quantize.c @@ -33,6 +33,40 @@ void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr, *eob_ptr = 0; } +int av1_quantize_fp_no_qmatrix(const int16_t quant_ptr[2], + const int16_t dequant_ptr[2], + const int16_t round_ptr[2], int log_scale, + const int16_t *scan, int coeff_count, + const tran_low_t *coeff_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr) { + memset(qcoeff_ptr, 0, coeff_count * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, coeff_count * sizeof(*dqcoeff_ptr)); + const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale), + ROUND_POWER_OF_TWO(round_ptr[1], log_scale) }; + int eob = 0; + for (int i = 0; i < coeff_count; i++) { + const int rc = scan[i]; + const int32_t thresh = (int32_t)(dequant_ptr[rc != 0]); + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp32 = 0; + if ((abs_coeff << (1 + log_scale)) 
>= thresh) { + abs_coeff = clamp64(abs_coeff + rounding[rc != 0], INT16_MIN, INT16_MAX); + tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale)); + if (tmp32) { + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + const tran_low_t abs_dqcoeff = + (tmp32 * dequant_ptr[rc != 0]) >> log_scale; + dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign; + } + } + if (tmp32) eob = i + 1; + } + return eob; +} + static void quantize_fp_helper_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, @@ -53,26 +87,9 @@ static void quantize_fp_helper_c( memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); if (qm_ptr == NULL && iqm_ptr == NULL) { - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int32_t thresh = (int32_t)(dequant_ptr[rc != 0]); - const int coeff = coeff_ptr[rc]; - const int coeff_sign = AOMSIGN(coeff); - int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int tmp32 = 0; - if ((abs_coeff << (1 + log_scale)) >= thresh) { - abs_coeff = - clamp64(abs_coeff + rounding[rc != 0], INT16_MIN, INT16_MAX); - tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale)); - if (tmp32) { - qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; - const tran_low_t abs_dqcoeff = - (tmp32 * dequant_ptr[rc != 0]) >> log_scale; - dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign; - } - } - if (tmp32) eob = i; - } + *eob_ptr = av1_quantize_fp_no_qmatrix(quant_ptr, dequant_ptr, round_ptr, + log_scale, scan, (int)n_coeffs, + coeff_ptr, qcoeff_ptr, dqcoeff_ptr); } else { // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. 
@@ -100,8 +117,8 @@ static void quantize_fp_helper_c( if (tmp32) eob = i; } + *eob_ptr = eob + 1; } - *eob_ptr = eob + 1; } #if CONFIG_AV1_HIGHBITDEPTH @@ -767,7 +784,7 @@ void av1_set_quantizer(AV1_COMMON *const cm, int min_qmlevel, int max_qmlevel, aom_get_qmlevel(quant_params->base_qindex + quant_params->u_ac_delta_q, min_qmlevel, max_qmlevel); - if (!cm->seq_params.separate_uv_delta_q) + if (!cm->seq_params->separate_uv_delta_q) quant_params->qmatrix_level_v = quant_params->qmatrix_level_u; else quant_params->qmatrix_level_v = diff --git a/third_party/libaom/source/libaom/av1/encoder/av1_quantize.h b/third_party/libaom/source/libaom/av1/encoder/av1_quantize.h index ad9619747a..215feb0603 100644 --- a/third_party/libaom/source/libaom/av1/encoder/av1_quantize.h +++ b/third_party/libaom/source/libaom/av1/encoder/av1_quantize.h @@ -118,6 +118,32 @@ int av1_qindex_to_quantizer(int qindex); void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr); +/*!\brief Quantize transform coefficients without using qmatrix + * + * quant_ptr, dequant_ptr and round_ptr are size 2 arrays, + * where index 0 corresponds to dc coeff and index 1 corresponds to ac coeffs. + * + * \param[in] quant_ptr 16-bit fixed point representation of inverse + * quantize step size, i.e. 
2^16/dequant + * \param[in] dequant_ptr quantize step size + * \param[in] round_ptr rounding + * \param[in] log_scale the relative log scale of the transform + * coefficients + * \param[in] scan scan[i] indicates the position of ith to-be-coded + * coefficient + * \param[in] coeff_count number of coefficients + * \param[out] qcoeff_ptr quantized coefficients + * \param[out] dqcoeff_ptr dequantized coefficients + * + * \return The last non-zero coefficient's scan index plus 1 + */ +int av1_quantize_fp_no_qmatrix(const int16_t quant_ptr[2], + const int16_t dequant_ptr[2], + const int16_t round_ptr[2], int log_scale, + const int16_t *scan, int coeff_count, + const tran_low_t *coeff_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr); + void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, diff --git a/third_party/libaom/source/libaom/av1/encoder/av1_temporal_denoiser.c b/third_party/libaom/source/libaom/av1/encoder/av1_temporal_denoiser.c index 6c5bb930e1..96f3d7dcfe 100644 --- a/third_party/libaom/source/libaom/av1/encoder/av1_temporal_denoiser.c +++ b/third_party/libaom/source/libaom/av1/encoder/av1_temporal_denoiser.c @@ -349,7 +349,7 @@ void av1_denoiser_denoise(AV1_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, &cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx, motion_magnitude, &zeromv_filter, cpi->svc.number_spatial_layers, cpi->source->y_width, cpi->svc.ref_idx[0], cpi->svc.ref_idx[3], - cpi->use_svc, cpi->svc.spatial_layer_id, use_gf_temporal_ref); + cpi->ppi->use_svc, cpi->svc.spatial_layer_id, use_gf_temporal_ref); if (decision == FILTER_BLOCK) { decision = av1_denoiser_filter(src.buf, src.stride, mc_avg_start, @@ -415,7 +415,7 @@ void av1_denoiser_update_frame_info( return; } - if (svc->external_ref_frame_config) { + if (svc->set_ref_frame_config) { int i; for (i = 0; i < REF_FRAMES; i++) { if 
(svc->refresh[svc->spatial_layer_id] & (1 << i)) @@ -485,8 +485,8 @@ static int av1_denoiser_realloc_svc_helper(AV1_COMMON *cm, if (denoiser->running_avg_y[fb_idx].buffer_alloc == NULL) { fail = aom_alloc_frame_buffer( &denoiser->running_avg_y[fb_idx], cm->width, cm->height, - cm->seq_params.subsampling_x, cm->seq_params.subsampling_y, - cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, + cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->features.byte_alignment); if (fail) { av1_denoiser_free(denoiser); @@ -501,7 +501,7 @@ int av1_denoiser_realloc_svc(AV1_COMMON *cm, AV1_DENOISER *denoiser, int refresh_alt, int refresh_gld, int refresh_lst, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx) { int fail = 0; - if (svc->external_ref_frame_config) { + if (svc->set_ref_frame_config) { int i; for (i = 0; i < REF_FRAMES; i++) { if (cm->current_frame.frame_type == KEY_FRAME || @@ -724,7 +724,7 @@ void av1_denoiser_update_ref_frame(AV1_COMP *const cpi) { (cpi->common.width != cpi->resize_pending_params.width || cpi->common.height != cpi->resize_pending_params.height)); - if (cpi->use_svc) { + if (cpi->ppi->use_svc) { // TODO(kyslov) Enable when SVC temporal denosing is implemented #if 0 const int svc_buf_shift = @@ -746,7 +746,7 @@ void av1_denoiser_update_ref_frame(AV1_COMP *const cpi) { cpi->refresh_golden_frame, cpi->refresh_last_frame, cpi->alt_fb_idx, cpi->gld_fb_idx, cpi->lst_fb_idx)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to re-allocate denoiser for SVC"); #endif } diff --git a/third_party/libaom/source/libaom/av1/encoder/bitstream.c b/third_party/libaom/source/libaom/av1/encoder/bitstream.c index 2b583790ff..85c0183b17 100644 --- a/third_party/libaom/source/libaom/av1/encoder/bitstream.c +++ b/third_party/libaom/source/libaom/av1/encoder/bitstream.c @@ -41,6 +41,7 @@ #include "av1/encoder/cost.h" #include 
"av1/encoder/encodemv.h" #include "av1/encoder/encodetxb.h" +#include "av1/encoder/ethread.h" #include "av1/encoder/mcomp.h" #include "av1/encoder/palette.h" #include "av1/encoder/segmentation.h" @@ -185,12 +186,13 @@ static AOM_INLINE void write_tx_size_vartx(MACROBLOCKD *xd, } assert(bsw > 0 && bsh > 0); - for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { + const int offsetr = blk_row + row; for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { - int offsetr = blk_row + row; - int offsetc = blk_col + col; + const int offsetc = blk_col + col; write_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, w); } + } } } @@ -313,14 +315,16 @@ static AOM_INLINE void write_delta_qindex(const MACROBLOCKD *xd, static AOM_INLINE void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd, int lf_id, - int delta_lflevel, aom_writer *w) { + int delta_lflevel, + int delta_lf_multi, aom_writer *w) { int sign = delta_lflevel < 0; int abs = sign ? -delta_lflevel : delta_lflevel; int rem_bits, thr; int smallval = abs < DELTA_LF_SMALL ? 1 : 0; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + (void)cm; - if (cm->delta_q_info.delta_lf_multi) { + if (delta_lf_multi) { assert(lf_id >= 0 && lf_id < (av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2)); aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), @@ -380,7 +384,6 @@ static AOM_INLINE void pack_txb_tokens( #if CONFIG_RD_DEBUG TOKEN_STATS tmp_token_stats; init_token_stats(&tmp_token_stats); - token_stats->txb_coeff_cost_map[blk_row][blk_col] = tmp_token_stats.cost; token_stats->cost += tmp_token_stats.cost; #endif } else { @@ -388,14 +391,17 @@ static AOM_INLINE void pack_txb_tokens( const int bsw = tx_size_wide_unit[sub_txs]; const int bsh = tx_size_high_unit[sub_txs]; const int step = bsh * bsw; + const int row_end = + AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); + const int col_end = + AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); assert(bsw > 0 && bsh > 0); - for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) { - for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw) { - const int offsetr = blk_row + r; + for (int r = 0; r < row_end; r += bsh) { + const int offsetr = blk_row + r; + for (int c = 0; c < col_end; c += bsw) { const int offsetc = blk_col + c; - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; pack_txb_tokens(w, cm, x, tp, tok_end, xd, mbmi, plane, plane_bsize, bit_depth, block, offsetr, offsetc, sub_txs, token_stats); @@ -445,7 +451,7 @@ int av1_neg_interleave(int x, int ref, int max) { } } -static AOM_INLINE void write_segment_id(AV1_COMP *cpi, +static AOM_INLINE void write_segment_id(AV1_COMP *cpi, MACROBLOCKD *const xd, const MB_MODE_INFO *const mbmi, aom_writer *w, const struct segmentation *seg, @@ -454,7 +460,6 @@ static AOM_INLINE void write_segment_id(AV1_COMP *cpi, if (!seg->enabled || !seg->update_map) return; AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; int cdf_num; const int pred = av1_get_spatial_seg_pred(cm, xd, &cdf_num); const int mi_row = xd->mi_row; @@ -613,8 +618,8 @@ static AOM_INLINE void write_angle_delta(aom_writer *w, int angle_delta, } static AOM_INLINE void 
write_mb_interp_filter(AV1_COMMON *const cm, - const MACROBLOCKD *xd, - aom_writer *w) { + ThreadData *td, aom_writer *w) { + const MACROBLOCKD *xd = &td->mb.e_mbd; const MB_MODE_INFO *const mbmi = xd->mi[0]; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; @@ -633,8 +638,8 @@ static AOM_INLINE void write_mb_interp_filter(AV1_COMMON *const cm, av1_extract_interp_filter(mbmi->interp_filters, dir); aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS); - ++cm->cur_frame->interp_filter_selected[filter]; - if (cm->seq_params.enable_dual_filter == 0) return; + ++td->interp_filter_selected[filter]; + if (cm->seq_params->enable_dual_filter == 0) return; } } } @@ -777,7 +782,7 @@ static AOM_INLINE void write_palette_mode_info(const AV1_COMMON *cm, aom_write_symbol(w, n - PALETTE_MIN_SIZE, xd->tile_ctx->palette_y_size_cdf[bsize_ctx], PALETTE_SIZES); - write_palette_colors_y(xd, pmi, cm->seq_params.bit_depth, w); + write_palette_colors_y(xd, pmi, cm->seq_params->bit_depth, w); } } @@ -792,7 +797,7 @@ static AOM_INLINE void write_palette_mode_info(const AV1_COMMON *cm, aom_write_symbol(w, n - PALETTE_MIN_SIZE, xd->tile_ctx->palette_uv_size_cdf[bsize_ctx], PALETTE_SIZES); - write_palette_colors_uv(xd, pmi, cm->seq_params.bit_depth, w); + write_palette_colors_uv(xd, pmi, cm->seq_params->bit_depth, w); } } } @@ -874,7 +879,7 @@ static AOM_INLINE void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd, // At the start of a superblock, mark that we haven't yet written CDEF // strengths for any of the CDEF units contained in this superblock. 
- const int sb_mask = (cm->seq_params.mib_size - 1); + const int sb_mask = (cm->seq_params->mib_size - 1); const int mi_row_in_sb = (xd->mi_row & sb_mask); const int mi_col_in_sb = (xd->mi_col & sb_mask); if (mi_row_in_sb == 0 && mi_col_in_sb == 0) { @@ -889,7 +894,7 @@ static AOM_INLINE void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd, const int index_mask = cdef_size; const int cdef_unit_row_in_sb = ((xd->mi_row & index_mask) != 0); const int cdef_unit_col_in_sb = ((xd->mi_col & index_mask) != 0); - const int index = (cm->seq_params.sb_size == BLOCK_128X128) + const int index = (cm->seq_params->sb_size == BLOCK_128X128) ? cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb : 0; @@ -909,9 +914,9 @@ static AOM_INLINE void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd, } static AOM_INLINE void write_inter_segment_id( - AV1_COMP *cpi, aom_writer *w, const struct segmentation *const seg, - struct segmentation_probs *const segp, int skip, int preskip) { - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + AV1_COMP *cpi, MACROBLOCKD *const xd, aom_writer *w, + const struct segmentation *const seg, struct segmentation_probs *const segp, + int skip, int preskip) { MB_MODE_INFO *const mbmi = xd->mi[0]; AV1_COMMON *const cm = &cpi->common; const int mi_row = xd->mi_row; @@ -923,7 +928,7 @@ static AOM_INLINE void write_inter_segment_id( } else { if (seg->segid_preskip) return; if (skip) { - write_segment_id(cpi, mbmi, w, seg, segp, 1); + write_segment_id(cpi, xd, mbmi, w, seg, segp, 1); if (seg->temporal_update) mbmi->seg_id_predicted = 0; return; } @@ -933,35 +938,33 @@ static AOM_INLINE void write_inter_segment_id( aom_cdf_prob *pred_cdf = av1_get_pred_cdf_seg_id(segp, xd); aom_write_symbol(w, pred_flag, pred_cdf, 2); if (!pred_flag) { - write_segment_id(cpi, mbmi, w, seg, segp, 0); + write_segment_id(cpi, xd, mbmi, w, seg, segp, 0); } if (pred_flag) { set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->bsize, mi_row, mi_col, mbmi->segment_id); } } else { - 
write_segment_id(cpi, mbmi, w, seg, segp, 0); + write_segment_id(cpi, xd, mbmi, w, seg, segp, 0); } } } // If delta q is present, writes delta_q index. // Also writes delta_q loop filter levels, if present. -static AOM_INLINE void write_delta_q_params(AV1_COMP *cpi, int skip, +static AOM_INLINE void write_delta_q_params(AV1_COMMON *const cm, + MACROBLOCKD *const xd, int skip, aom_writer *w) { - AV1_COMMON *const cm = &cpi->common; const DeltaQInfo *const delta_q_info = &cm->delta_q_info; if (delta_q_info->delta_q_present_flag) { - MACROBLOCK *const x = &cpi->td.mb; - MACROBLOCKD *const xd = &x->e_mbd; const MB_MODE_INFO *const mbmi = xd->mi[0]; const BLOCK_SIZE bsize = mbmi->bsize; const int super_block_upper_left = - ((xd->mi_row & (cm->seq_params.mib_size - 1)) == 0) && - ((xd->mi_col & (cm->seq_params.mib_size - 1)) == 0); + ((xd->mi_row & (cm->seq_params->mib_size - 1)) == 0) && + ((xd->mi_col & (cm->seq_params->mib_size - 1)) == 0); - if ((bsize != cm->seq_params.sb_size || skip == 0) && + if ((bsize != cm->seq_params->sb_size || skip == 0) && super_block_upper_left) { assert(mbmi->current_qindex > 0); const int reduced_delta_qindex = @@ -977,14 +980,14 @@ static AOM_INLINE void write_delta_q_params(AV1_COMP *cpi, int skip, int reduced_delta_lflevel = (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / delta_q_info->delta_lf_res; - write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, w); + write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, 1, w); xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; } } else { int reduced_delta_lflevel = (mbmi->delta_lf_from_base - xd->delta_lf_from_base) / delta_q_info->delta_lf_res; - write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, w); + write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, 0, w); xd->delta_lf_from_base = mbmi->delta_lf_from_base; } } @@ -992,12 +995,10 @@ static AOM_INLINE void write_delta_q_params(AV1_COMP *cpi, int skip, } } -static AOM_INLINE void write_intra_prediction_modes(AV1_COMP *cpi, 
+static AOM_INLINE void write_intra_prediction_modes(const AV1_COMMON *cm, + MACROBLOCKD *const xd, int is_keyframe, aom_writer *w) { - const AV1_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->td.mb; - MACROBLOCKD *const xd = &x->e_mbd; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; const MB_MODE_INFO *const mbmi = xd->mi[0]; const PREDICTION_MODE mode = mbmi->mode; @@ -1020,7 +1021,7 @@ static AOM_INLINE void write_intra_prediction_modes(AV1_COMP *cpi, } // UV mode and UV angle delta. - if (!cm->seq_params.monochrome && xd->is_chroma_ref) { + if (!cm->seq_params->monochrome && xd->is_chroma_ref) { const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w); if (uv_mode == UV_CFL_PRED) @@ -1082,9 +1083,10 @@ static INLINE int_mv get_ref_mv(const MACROBLOCK *x, int ref_idx) { x->mbmi_ext_frame); } -static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, aom_writer *w) { +static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, ThreadData *const td, + aom_writer *w) { AV1_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; FRAME_CONTEXT *ec_ctx = xd->tile_ctx; const struct segmentation *const seg = &cm->seg; @@ -1099,7 +1101,7 @@ static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, aom_writer *w) { const int is_compound = has_second_ref(mbmi); int ref; - write_inter_segment_id(cpi, w, seg, segp, 0, 1); + write_inter_segment_id(cpi, xd, w, seg, segp, 0, 1); write_skip_mode(cm, xd, segment_id, mbmi, w); @@ -1107,18 +1109,18 @@ static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, aom_writer *w) { const int skip = mbmi->skip_mode ? 
1 : write_skip(cm, xd, segment_id, mbmi, w); - write_inter_segment_id(cpi, w, seg, segp, skip, 0); + write_inter_segment_id(cpi, xd, w, seg, segp, skip, 0); write_cdef(cm, xd, w, skip); - write_delta_q_params(cpi, skip, w); + write_delta_q_params(cm, xd, skip, w); if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter); if (mbmi->skip_mode) return; if (!is_inter) { - write_intra_prediction_modes(cpi, 0, w); + write_intra_prediction_modes(cm, xd, 0, w); } else { int16_t mode_ctx; @@ -1146,21 +1148,23 @@ static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, aom_writer *w) { for (ref = 0; ref < 1 + is_compound; ++ref) { nmv_context *nmvc = &ec_ctx->nmvc; const int_mv ref_mv = get_ref_mv(x, ref); - av1_encode_mv(cpi, w, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc, + av1_encode_mv(cpi, w, td, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc, allow_hp); } } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) { nmv_context *nmvc = &ec_ctx->nmvc; const int_mv ref_mv = get_ref_mv(x, 1); - av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc, allow_hp); + av1_encode_mv(cpi, w, td, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc, + allow_hp); } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) { nmv_context *nmvc = &ec_ctx->nmvc; const int_mv ref_mv = get_ref_mv(x, 0); - av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc, allow_hp); + av1_encode_mv(cpi, w, td, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc, + allow_hp); } if (cpi->common.current_frame.reference_mode != COMPOUND_REFERENCE && - cpi->common.seq_params.enable_interintra_compound && + cpi->common.seq_params->enable_interintra_compound && is_interintra_allowed(mbmi)) { const int interintra = mbmi->ref_frame[1] == INTRA_FRAME; const int bsize_group = size_group_lookup[bsize]; @@ -1187,7 +1191,7 @@ static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, aom_writer *w) { // Group B (1): interintra, compound_diffwtd, wedge if (has_second_ref(mbmi)) { const int masked_compound_used = 
is_any_masked_compound_used(bsize) && - cm->seq_params.enable_masked_compound; + cm->seq_params->enable_masked_compound; if (masked_compound_used) { const int ctx_comp_group_idx = get_comp_group_idx_context(xd); @@ -1201,7 +1205,7 @@ static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, aom_writer *w) { if (mbmi->compound_idx) assert(mbmi->interinter_comp.type == COMPOUND_AVERAGE); - if (cm->seq_params.order_hint_info.enable_dist_wtd_comp) { + if (cm->seq_params->order_hint_info.enable_dist_wtd_comp) { const int comp_index_ctx = get_comp_index_context(cm, xd); aom_write_symbol(w, mbmi->compound_idx, ec_ctx->compound_index_cdf[comp_index_ctx], 2); @@ -1234,7 +1238,7 @@ static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, aom_writer *w) { } } } - write_mb_interp_filter(cm, xd, w); + write_mb_interp_filter(cm, td, w); } } @@ -1264,23 +1268,23 @@ static AOM_INLINE void write_mb_modes_kf( const MB_MODE_INFO *const mbmi = xd->mi[0]; if (seg->segid_preskip && seg->update_map) - write_segment_id(cpi, mbmi, w, seg, segp, 0); + write_segment_id(cpi, xd, mbmi, w, seg, segp, 0); const int skip = write_skip(cm, xd, mbmi->segment_id, mbmi, w); if (!seg->segid_preskip && seg->update_map) - write_segment_id(cpi, mbmi, w, seg, segp, skip); + write_segment_id(cpi, xd, mbmi, w, seg, segp, skip); write_cdef(cm, xd, w, skip); - write_delta_q_params(cpi, skip, w); + write_delta_q_params(cm, xd, skip, w); if (av1_allow_intrabc(cm)) { write_intrabc_info(xd, mbmi_ext_frame, w); if (is_intrabc_block(mbmi)) return; } - write_intra_prediction_modes(cpi, 1, w); + write_intra_prediction_modes(cm, xd, 1, w); } #if CONFIG_RD_DEBUG @@ -1295,24 +1299,8 @@ static AOM_INLINE void dump_mode_info(MB_MODE_INFO *mi) { static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats, int plane) { if (rd_stats->txb_coeff_cost[plane] != token_stats->cost) { - int r, c; printf("\nplane %d rd_stats->txb_coeff_cost %d token_stats->cost %d\n", plane, rd_stats->txb_coeff_cost[plane], 
token_stats->cost); - printf("rd txb_coeff_cost_map\n"); - for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) { - for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) { - printf("%d ", rd_stats->txb_coeff_cost_map[plane][r][c]); - } - printf("\n"); - } - - printf("pack txb_coeff_cost_map\n"); - for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) { - for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) { - printf("%d ", token_stats->txb_coeff_cost_map[r][c]); - } - printf("\n"); - } return 1; } return 0; @@ -1376,13 +1364,14 @@ static AOM_INLINE void enc_dump_logs( } #endif // ENC_MISMATCH_DEBUG -static AOM_INLINE void write_mbmi_b(AV1_COMP *cpi, aom_writer *w) { +static AOM_INLINE void write_mbmi_b(AV1_COMP *cpi, ThreadData *const td, + aom_writer *w) { AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + MACROBLOCKD *const xd = &td->mb.e_mbd; MB_MODE_INFO *m = xd->mi[0]; if (frame_is_intra_only(cm)) { - write_mb_modes_kf(cpi, xd, cpi->td.mb.mbmi_ext_frame, w); + write_mb_modes_kf(cpi, xd, td->mb.mbmi_ext_frame, w); } else { // has_subpel_mv_component needs the ref frame buffers set up to look // up if they are scaled. 
has_subpel_mv_component is in turn needed by @@ -1393,7 +1382,7 @@ static AOM_INLINE void write_mbmi_b(AV1_COMP *cpi, aom_writer *w) { enc_dump_logs(cm, &cpi->mbmi_ext_info, xd->mi_row, xd->mi_col); #endif // ENC_MISMATCH_DEBUG - pack_inter_mode_mvs(cpi, w); + pack_inter_mode_mvs(cpi, td, w); } } @@ -1426,18 +1415,17 @@ static AOM_INLINE void write_inter_txb_coeff( for (int blk_row = row >> ss_y; blk_row < unit_height; blk_row += bkh) { for (int blk_col = col >> ss_x; blk_col < unit_width; blk_col += bkw) { pack_txb_tokens(w, cm, x, tok, tok_end, xd, mbmi, plane, plane_bsize, - cm->seq_params.bit_depth, *block, blk_row, blk_col, + cm->seq_params->bit_depth, *block, blk_row, blk_col, max_tx_size, token_stats); *block += step; } } } -static AOM_INLINE void write_tokens_b(AV1_COMP *cpi, aom_writer *w, - const TokenExtra **tok, +static AOM_INLINE void write_tokens_b(AV1_COMP *cpi, MACROBLOCK *const x, + aom_writer *w, const TokenExtra **tok, const TokenExtra *const tok_end) { AV1_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->td.mb; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; const BLOCK_SIZE bsize = mbmi->bsize; @@ -1487,17 +1475,18 @@ static AOM_INLINE void write_tokens_b(AV1_COMP *cpi, aom_writer *w, } } -static AOM_INLINE void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile, - aom_writer *w, const TokenExtra **tok, +static AOM_INLINE void write_modes_b(AV1_COMP *cpi, ThreadData *const td, + const TileInfo *const tile, aom_writer *w, + const TokenExtra **tok, const TokenExtra *const tok_end, int mi_row, int mi_col) { const AV1_COMMON *cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; - MACROBLOCKD *xd = &cpi->td.mb.e_mbd; + MACROBLOCKD *xd = &td->mb.e_mbd; FRAME_CONTEXT *tile_ctx = xd->tile_ctx; const int grid_idx = mi_row * mi_params->mi_stride + mi_col; xd->mi = mi_params->mi_grid_base + grid_idx; - cpi->td.mb.mbmi_ext_frame = + td->mb.mbmi_ext_frame = cpi->mbmi_ext_info.frame_base 
+ get_mi_ext_idx(mi_row, mi_col, cm->mi_params.mi_alloc_bsize, cpi->mbmi_ext_info.stride); @@ -1506,7 +1495,7 @@ static AOM_INLINE void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile, const MB_MODE_INFO *mbmi = xd->mi[0]; const BLOCK_SIZE bsize = mbmi->bsize; - assert(bsize <= cm->seq_params.sb_size || + assert(bsize <= cm->seq_params->sb_size || (bsize >= BLOCK_SIZES && bsize < BLOCK_SIZES_ALL)); const int bh = mi_size_high[bsize]; @@ -1518,7 +1507,7 @@ static AOM_INLINE void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile, xd->left_txfm_context = xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); - write_mbmi_b(cpi, w); + write_mbmi_b(cpi, td, w); for (int plane = 0; plane < AOMMIN(2, av1_num_planes(cm)); ++plane) { const uint8_t palette_size_plane = @@ -1567,10 +1556,10 @@ static AOM_INLINE void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile, if (!mbmi->skip_txfm) { int start = aom_tell_size(w); - write_tokens_b(cpi, w, tok, tok_end); + write_tokens_b(cpi, &td->mb, w, tok, tok_end); const int end = aom_tell_size(w); - cpi->rc.coefficient_size += end - start; + td->coefficient_size += end - start; } } @@ -1612,12 +1601,12 @@ static AOM_INLINE void write_partition(const AV1_COMMON *const cm, } static AOM_INLINE void write_modes_sb( - AV1_COMP *const cpi, const TileInfo *const tile, aom_writer *const w, - const TokenExtra **tok, const TokenExtra *const tok_end, int mi_row, - int mi_col, BLOCK_SIZE bsize) { + AV1_COMP *const cpi, ThreadData *const td, const TileInfo *const tile, + aom_writer *const w, const TokenExtra **tok, + const TokenExtra *const tok_end, int mi_row, int mi_col, BLOCK_SIZE bsize) { const AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + MACROBLOCKD *const xd = &td->mb.e_mbd; assert(bsize < BLOCK_SIZES_ALL); const int hbs = mi_size_wide[bsize] / 2; const int quarter_step = mi_size_wide[bsize] / 4; @@ -1639,8 +1628,7 @@ 
static AOM_INLINE void write_modes_sb( const int runit_idx = rcol + rrow * rstride; const RestorationUnitInfo *rui = &cm->rst_info[plane].unit_info[runit_idx]; - loop_restoration_write_sb_coeffs(cm, xd, rui, w, plane, - cpi->td.counts); + loop_restoration_write_sb_coeffs(cm, xd, rui, w, plane, td->counts); } } } @@ -1650,51 +1638,53 @@ static AOM_INLINE void write_modes_sb( write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w); switch (partition) { case PARTITION_NONE: - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); break; case PARTITION_HORZ: - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); if (mi_row + hbs < mi_params->mi_rows) - write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col); break; case PARTITION_VERT: - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); if (mi_col + hbs < mi_params->mi_cols) - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs); break; case PARTITION_SPLIT: - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize); - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs, subsize); - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col, subsize); - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs, + write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row, mi_col, subsize); + write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs, + subsize); + write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col, + subsize); + write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs, subsize); break; case PARTITION_HORZ_A: - write_modes_b(cpi, tile, w, tok, 
tok_end, mi_row, mi_col); - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); - write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col); break; case PARTITION_HORZ_B: - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); - write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); - write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); break; case PARTITION_VERT_A: - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); - write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs); break; case PARTITION_VERT_B: - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); - write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); break; case PARTITION_HORZ_4: for (i = 0; i < 4; ++i) { int this_mi_row = mi_row + i * quarter_step; if (i > 0 && this_mi_row >= mi_params->mi_rows) break; - write_modes_b(cpi, tile, w, tok, tok_end, this_mi_row, mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, this_mi_row, mi_col); } break; case 
PARTITION_VERT_4: @@ -1702,7 +1692,7 @@ static AOM_INLINE void write_modes_sb( int this_mi_col = mi_col + i * quarter_step; if (i > 0 && this_mi_col >= mi_params->mi_cols) break; - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, this_mi_col); + write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, this_mi_col); } break; default: assert(0); @@ -1712,12 +1702,12 @@ static AOM_INLINE void write_modes_sb( update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); } -static AOM_INLINE void write_modes(AV1_COMP *const cpi, +static AOM_INLINE void write_modes(AV1_COMP *const cpi, ThreadData *const td, const TileInfo *const tile, aom_writer *const w, int tile_row, int tile_col) { AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + MACROBLOCKD *const xd = &td->mb.e_mbd; const int mi_row_start = tile->mi_row_start; const int mi_row_end = tile->mi_row_end; const int mi_col_start = tile->mi_col_start; @@ -1735,9 +1725,9 @@ static AOM_INLINE void write_modes(AV1_COMP *const cpi, } for (int mi_row = mi_row_start; mi_row < mi_row_end; - mi_row += cm->seq_params.mib_size) { + mi_row += cm->seq_params->mib_size) { const int sb_row_in_tile = - (mi_row - tile->mi_row_start) >> cm->seq_params.mib_size_log2; + (mi_row - tile->mi_row_start) >> cm->seq_params->mib_size_log2; const TokenExtra *tok = cpi->token_info.tplist[tile_row][tile_col][sb_row_in_tile].start; const TokenExtra *tok_end = @@ -1746,10 +1736,10 @@ static AOM_INLINE void write_modes(AV1_COMP *const cpi, av1_zero_left_context(xd); for (int mi_col = mi_col_start; mi_col < mi_col_end; - mi_col += cm->seq_params.mib_size) { - cpi->td.mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col); - write_modes_sb(cpi, tile, w, &tok, tok_end, mi_row, mi_col, - cm->seq_params.sb_size); + mi_col += cm->seq_params->mib_size) { + td->mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col); + write_modes_sb(cpi, td, tile, w, &tok, tok_end, mi_row, mi_col, + 
cm->seq_params->sb_size); } assert(tok == tok_end); } @@ -1758,7 +1748,7 @@ static AOM_INLINE void write_modes(AV1_COMP *const cpi, static AOM_INLINE void encode_restoration_mode( AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { assert(!cm->features.all_lossless); - if (!cm->seq_params.enable_restoration) return; + if (!cm->seq_params->enable_restoration) return; if (cm->features.allow_intrabc) return; const int num_planes = av1_num_planes(cm); int all_none = 1, chroma_none = 1; @@ -1789,9 +1779,9 @@ static AOM_INLINE void encode_restoration_mode( } } if (!all_none) { - assert(cm->seq_params.sb_size == BLOCK_64X64 || - cm->seq_params.sb_size == BLOCK_128X128); - const int sb_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64; + assert(cm->seq_params->sb_size == BLOCK_64X64 || + cm->seq_params->sb_size == BLOCK_128X128); + const int sb_size = cm->seq_params->sb_size == BLOCK_128X128 ? 128 : 64; RestorationInfo *rsi = &cm->rst_info[0]; @@ -1807,7 +1797,8 @@ static AOM_INLINE void encode_restoration_mode( } if (num_planes > 1) { - int s = AOMMIN(cm->seq_params.subsampling_x, cm->seq_params.subsampling_y); + int s = + AOMMIN(cm->seq_params->subsampling_x, cm->seq_params->subsampling_y); if (s && !chroma_none) { aom_wb_write_bit(wb, cm->rst_info[1].restoration_unit_size != cm->rst_info[0].restoration_unit_size); @@ -2040,7 +2031,7 @@ static AOM_INLINE void encode_loopfilter(AV1_COMMON *cm, static AOM_INLINE void encode_cdef(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { assert(!cm->features.coded_lossless); - if (!cm->seq_params.enable_cdef) return; + if (!cm->seq_params->enable_cdef) return; if (cm->features.allow_intrabc) return; const int num_planes = av1_num_planes(cm); int i; @@ -2093,7 +2084,7 @@ static AOM_INLINE void encode_quantization( } } -static AOM_INLINE void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd, +static AOM_INLINE void encode_segmentation(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { int i, j; struct segmentation 
*seg = &cm->seg; @@ -2102,17 +2093,9 @@ static AOM_INLINE void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd, if (!seg->enabled) return; // Write update flags - if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) { - assert(seg->update_map == 1); - seg->temporal_update = 0; - assert(seg->update_data == 1); - } else { + if (cm->features.primary_ref_frame != PRIMARY_REF_NONE) { aom_wb_write_bit(wb, seg->update_map); - if (seg->update_map) { - // Select the coding strategy (temporal or spatial) - av1_choose_segmap_coding_method(cm, xd); - aom_wb_write_bit(wb, seg->temporal_update); - } + if (seg->update_map) aom_wb_write_bit(wb, seg->temporal_update); aom_wb_write_bit(wb, seg->update_data); } @@ -2163,11 +2146,11 @@ static AOM_INLINE void wb_write_uniform(struct aom_write_bit_buffer *wb, int n, static AOM_INLINE void write_tile_info_max_tile( const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { int width_mi = - ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params.mib_size_log2); + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2); int height_mi = - ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params.mib_size_log2); - int width_sb = width_mi >> cm->seq_params.mib_size_log2; - int height_sb = height_mi >> cm->seq_params.mib_size_log2; + ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2); + int width_sb = width_mi >> cm->seq_params->mib_size_log2; + int height_sb = height_mi >> cm->seq_params->mib_size_log2; int size_sb, i; const CommonTileParams *const tiles = &cm->tiles; @@ -2244,13 +2227,6 @@ static AOM_INLINE void write_ext_tile_info( } } -// Stores the location and size of a tile's data in the bitstream. 
Used for -// later identifying identical tiles -typedef struct TileBufferEnc { - uint8_t *data; - size_t size; -} TileBufferEnc; - static INLINE int find_identical_tile( const int tile_row, const int tile_col, TileBufferEnc (*const tile_buffers)[MAX_TILE_COLS]) { @@ -2314,7 +2290,7 @@ static AOM_INLINE void write_render_size(const AV1_COMMON *cm, static AOM_INLINE void write_superres_scale(const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; if (!seq_params->enable_superres) { assert(cm->superres_scale_denominator == SCALE_NUMERATOR); return; @@ -2341,7 +2317,7 @@ static AOM_INLINE void write_frame_size(const AV1_COMMON *cm, const int coded_height = cm->superres_upscaled_height - 1; if (frame_size_override) { - const SequenceHeader *seq_params = &cm->seq_params; + const SequenceHeader *seq_params = cm->seq_params; int num_bits_width = seq_params->num_bits_width; int num_bits_height = seq_params->num_bits_height; aom_wb_write_literal(wb, coded_width, num_bits_width); @@ -2499,7 +2475,7 @@ static AOM_INLINE void write_tu_pts_info(AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { aom_wb_write_unsigned_literal( wb, cm->frame_presentation_time, - cm->seq_params.decoder_model_info.frame_presentation_time_length); + cm->seq_params->decoder_model_info.frame_presentation_time_length); } static AOM_INLINE void write_film_grain_params( @@ -2537,15 +2513,15 @@ static AOM_INLINE void write_film_grain_params( aom_wb_write_literal(wb, pars->scaling_points_y[i][1], 8); } - if (!cm->seq_params.monochrome) { + if (!cm->seq_params->monochrome) { aom_wb_write_bit(wb, pars->chroma_scaling_from_luma); } else { assert(!pars->chroma_scaling_from_luma); } - if (cm->seq_params.monochrome || pars->chroma_scaling_from_luma || - ((cm->seq_params.subsampling_x == 1) && - (cm->seq_params.subsampling_y == 1) && (pars->num_y_points == 0))) { + if 
(cm->seq_params->monochrome || pars->chroma_scaling_from_luma || + ((cm->seq_params->subsampling_x == 1) && + (cm->seq_params->subsampling_y == 1) && (pars->num_y_points == 0))) { assert(pars->num_cb_points == 0 && pars->num_cr_points == 0); } else { aom_wb_write_literal(wb, pars->num_cb_points, 4); // max 10 @@ -2841,12 +2817,11 @@ static int check_frame_refs_short_signaling(AV1_COMMON *const cm) { // New function based on HLS R18 static AOM_INLINE void write_uncompressed_header_obu( - AV1_COMP *cpi, struct aom_write_bit_buffer *saved_wb, + AV1_COMP *cpi, MACROBLOCKD *const xd, struct aom_write_bit_buffer *saved_wb, struct aom_write_bit_buffer *wb) { AV1_COMMON *const cm = &cpi->common; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; const CommonQuantParams *quant_params = &cm->quant_params; - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; CurrentFrame *const current_frame = &cm->current_frame; FeatureFlags *const features = &cm->features; @@ -2925,7 +2900,7 @@ static AOM_INLINE void write_uncompressed_header_obu( if (cm->superres_upscaled_width > seq_params->max_frame_width || cm->superres_upscaled_height > seq_params->max_frame_height) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Frame dimensions are larger than the maximum values"); } @@ -2947,24 +2922,24 @@ static AOM_INLINE void write_uncompressed_header_obu( } if (seq_params->decoder_model_info_present_flag) { - aom_wb_write_bit(wb, cm->buffer_removal_time_present); - if (cm->buffer_removal_time_present) { + aom_wb_write_bit(wb, cpi->ppi->buffer_removal_time_present); + if (cpi->ppi->buffer_removal_time_present) { for (int op_num = 0; op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) { if (seq_params->op_params[op_num].decoder_model_param_present_flag) { - if (((seq_params->operating_point_idc[op_num] >> + if (seq_params->operating_point_idc[op_num] == 
0 || + ((seq_params->operating_point_idc[op_num] >> cm->temporal_layer_id) & 0x1 && (seq_params->operating_point_idc[op_num] >> (cm->spatial_layer_id + 8)) & - 0x1) || - seq_params->operating_point_idc[op_num] == 0) { + 0x1)) { aom_wb_write_unsigned_literal( wb, cm->buffer_removal_times[op_num], seq_params->decoder_model_info.buffer_removal_time_length); cm->buffer_removal_times[op_num]++; if (cm->buffer_removal_times[op_num] == 0) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM, "buffer_removal_time overflowed"); } } @@ -3051,7 +3026,7 @@ static AOM_INLINE void write_uncompressed_header_obu( 1; if (delta_frame_id_minus_1 < 0 || delta_frame_id_minus_1 >= (1 << diff_len)) { - aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Invalid delta_frame_id_minus_1"); } aom_wb_write_literal(wb, delta_frame_id_minus_1, diff_len); @@ -3088,8 +3063,8 @@ static AOM_INLINE void write_uncompressed_header_obu( write_tile_info(cm, saved_wb, wb); encode_quantization(quant_params, av1_num_planes(cm), - cm->seq_params.separate_uv_delta_q, wb); - encode_segmentation(cm, xd, wb); + cm->seq_params->separate_uv_delta_q, wb); + encode_segmentation(cm, wb); const DeltaQInfo *const delta_q_info = &cm->delta_q_info; if (delta_q_info->delta_q_present_flag) assert(quant_params->base_qindex > 0); @@ -3288,11 +3263,11 @@ static int remux_tiles(const CommonTileParams *const tiles, uint8_t *dst, } uint32_t av1_write_obu_header(AV1LevelParams *const level_params, - OBU_TYPE obu_type, int obu_extension, - uint8_t *const dst) { + int *frame_header_count, OBU_TYPE obu_type, + int obu_extension, uint8_t *const dst) { if (level_params->keep_level_stats && (obu_type == OBU_FRAME || obu_type == OBU_FRAME_HEADER)) - ++level_params->frame_header_count; + ++(*frame_header_count); struct aom_write_bit_buffer wb = { dst, 0 }; uint32_t size = 0; @@ -3326,8 +3301,8 @@ int 
av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size, return AOM_CODEC_OK; } -static size_t obu_memmove(size_t obu_header_size, size_t obu_payload_size, - uint8_t *data) { +size_t av1_obu_memmove(size_t obu_header_size, size_t obu_payload_size, + uint8_t *data) { const size_t length_field_size = aom_uleb_size_in_bytes(obu_payload_size); const size_t move_dst_offset = length_field_size + obu_header_size; const size_t move_src_offset = obu_header_size; @@ -3426,12 +3401,12 @@ uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params, return size; } -static uint32_t write_frame_header_obu(AV1_COMP *cpi, +static uint32_t write_frame_header_obu(AV1_COMP *cpi, MACROBLOCKD *const xd, struct aom_write_bit_buffer *saved_wb, uint8_t *const dst, int append_trailing_bits) { struct aom_write_bit_buffer wb = { dst, 0 }; - write_uncompressed_header_obu(cpi, saved_wb, &wb); + write_uncompressed_header_obu(cpi, xd, saved_wb, &wb); if (append_trailing_bits) add_trailing_bits(&wb); return aom_wb_bytes_written(&wb); } @@ -3455,12 +3430,6 @@ static uint32_t write_tile_group_header(uint8_t *const dst, int start_tile, return size; } -typedef struct { - uint8_t *frame_header; - size_t obu_header_byte_offset; - size_t total_length; -} FrameHeaderInfo; - extern void av1_print_uncompressed_frame_header(const uint8_t *data, int size, const char *filename); @@ -3473,16 +3442,17 @@ typedef struct { static uint32_t init_large_scale_tile_obu_header( AV1_COMP *const cpi, uint8_t **data, struct aom_write_bit_buffer *saved_wb, LargeTileFrameOBU *lst_obu) { - AV1LevelParams *const level_params = &cpi->level_params; + AV1LevelParams *const level_params = &cpi->ppi->level_params; CurrentFrame *const current_frame = &cpi->common.current_frame; // For large_scale_tile case, we always have only one tile group, so it can // be written as an OBU_FRAME. 
const OBU_TYPE obu_type = OBU_FRAME; - lst_obu->tg_hdr_size = av1_write_obu_header(level_params, obu_type, 0, *data); + lst_obu->tg_hdr_size = av1_write_obu_header( + level_params, &cpi->frame_header_count, obu_type, 0, *data); *data += lst_obu->tg_hdr_size; const uint32_t frame_header_size = - write_frame_header_obu(cpi, saved_wb, *data, 0); + write_frame_header_obu(cpi, &cpi->td.mb.e_mbd, saved_wb, *data, 0); *data += frame_header_size; lst_obu->frame_header_size = frame_header_size; // (yunqing) This test ensures the correctness of large scale tile coding. @@ -3520,7 +3490,7 @@ static void write_large_scale_tile_obu_size( *total_size += lst_obu->tg_hdr_size; const uint32_t obu_payload_size = *total_size - lst_obu->tg_hdr_size; const size_t length_field_size = - obu_memmove(lst_obu->tg_hdr_size, obu_payload_size, dst); + av1_obu_memmove(lst_obu->tg_hdr_size, obu_payload_size, dst); if (av1_write_uleb_obu_size(lst_obu->tg_hdr_size, obu_payload_size, dst) != AOM_CODEC_OK) assert(0); @@ -3551,6 +3521,7 @@ static void write_large_scale_tile_obu( const int tile_rows = tiles->rows; unsigned int tile_size = 0; + av1_reset_pack_bs_thread_data(&cpi->td); for (int tile_col = 0; tile_col < tile_cols; tile_col++) { TileInfo tile_info; const int is_last_col = (tile_col == tile_cols - 1); @@ -3579,7 +3550,7 @@ static void write_large_scale_tile_obu( mode_bc.allow_update_cdf = mode_bc.allow_update_cdf && !cm->features.disable_cdf_update; aom_start_encode(&mode_bc, buf->data + data_offset); - write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col); + write_modes(cpi, &cpi->td, &tile_info, &mode_bc, tile_row, tile_col); aom_stop_encode(&mode_bc); tile_size = mode_bc.pos; buf->size = tile_size; @@ -3627,6 +3598,7 @@ static void write_large_scale_tile_obu( *max_tile_col_size = AOMMAX(*max_tile_col_size, col_size); } } + av1_accumulate_pack_bs_thread_data(cpi, &cpi->td); } // Packs information in the obu header for large scale tiles. 
@@ -3656,147 +3628,236 @@ static INLINE uint32_t pack_large_scale_tiles_in_tg_obus( return total_size; } +// Writes obu, tile group and uncompressed headers to bitstream. +void av1_write_obu_tg_tile_headers(AV1_COMP *const cpi, MACROBLOCKD *const xd, + PackBSParams *const pack_bs_params, + const int tile_idx) { + AV1_COMMON *const cm = &cpi->common; + const CommonTileParams *const tiles = &cm->tiles; + int *const curr_tg_hdr_size = &pack_bs_params->curr_tg_hdr_size; + const int tg_size = + (tiles->rows * tiles->cols + cpi->num_tg - 1) / cpi->num_tg; + + // Write Tile group, frame and OBU header + // A new tile group begins at this tile. Write the obu header and + // tile group header + const OBU_TYPE obu_type = (cpi->num_tg == 1) ? OBU_FRAME : OBU_TILE_GROUP; + *curr_tg_hdr_size = av1_write_obu_header( + &cpi->ppi->level_params, &cpi->frame_header_count, obu_type, + pack_bs_params->obu_extn_header, pack_bs_params->tile_data_curr); + pack_bs_params->obu_header_size = *curr_tg_hdr_size; + + if (cpi->num_tg == 1) + *curr_tg_hdr_size += write_frame_header_obu( + cpi, xd, pack_bs_params->saved_wb, + pack_bs_params->tile_data_curr + *curr_tg_hdr_size, 0); + *curr_tg_hdr_size += write_tile_group_header( + pack_bs_params->tile_data_curr + *curr_tg_hdr_size, tile_idx, + AOMMIN(tile_idx + tg_size - 1, tiles->cols * tiles->rows - 1), + (tiles->log2_rows + tiles->log2_cols), cpi->num_tg > 1); + *pack_bs_params->total_size += *curr_tg_hdr_size; +} + +// Pack tile data in the bitstream with tile_group, frame +// and OBU header. 
+void av1_pack_tile_info(AV1_COMP *const cpi, ThreadData *const td, + PackBSParams *const pack_bs_params) { + aom_writer mode_bc; + AV1_COMMON *const cm = &cpi->common; + int tile_row = pack_bs_params->tile_row; + int tile_col = pack_bs_params->tile_col; + uint32_t *const total_size = pack_bs_params->total_size; + TileInfo tile_info; + av1_tile_set_col(&tile_info, cm, tile_col); + av1_tile_set_row(&tile_info, cm, tile_row); + mode_bc.allow_update_cdf = 1; + mode_bc.allow_update_cdf = + mode_bc.allow_update_cdf && !cm->features.disable_cdf_update; + + unsigned int tile_size; + + const int num_planes = av1_num_planes(cm); + av1_reset_loop_restoration(&td->mb.e_mbd, num_planes); + + pack_bs_params->buf.data = pack_bs_params->dst + *total_size; + + // The last tile of the tile group does not have a header. + if (!pack_bs_params->is_last_tile_in_tg) *total_size += 4; + + // Pack tile data + aom_start_encode(&mode_bc, pack_bs_params->dst + *total_size); + write_modes(cpi, td, &tile_info, &mode_bc, tile_row, tile_col); + aom_stop_encode(&mode_bc); + tile_size = mode_bc.pos; + assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES); + + pack_bs_params->buf.size = tile_size; + + // Write tile size + if (!pack_bs_params->is_last_tile_in_tg) { + // size of this tile + mem_put_le32(pack_bs_params->buf.data, tile_size - AV1_MIN_TILE_SIZE_BYTES); + } +} + +void av1_write_last_tile_info( + AV1_COMP *const cpi, const FrameHeaderInfo *fh_info, + struct aom_write_bit_buffer *saved_wb, size_t *curr_tg_data_size, + uint8_t *curr_tg_start, uint32_t *const total_size, + uint8_t **tile_data_start, int *const largest_tile_id, + int *const is_first_tg, uint32_t obu_header_size, uint8_t obu_extn_header) { + // write current tile group size + const uint32_t obu_payload_size = + (uint32_t)(*curr_tg_data_size) - obu_header_size; + const size_t length_field_size = + av1_obu_memmove(obu_header_size, obu_payload_size, curr_tg_start); + if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, + 
curr_tg_start) != AOM_CODEC_OK) { + assert(0); + } + *curr_tg_data_size += (int)length_field_size; + *total_size += (uint32_t)length_field_size; + *tile_data_start += length_field_size; + if (cpi->num_tg == 1) { + // if this tg is combined with the frame header then update saved + // frame header base offset according to length field size + saved_wb->bit_buffer += length_field_size; + } + + if (!(*is_first_tg) && cpi->common.features.error_resilient_mode) { + // Make room for a duplicate Frame Header OBU. + memmove(curr_tg_start + fh_info->total_length, curr_tg_start, + *curr_tg_data_size); + + // Insert a copy of the Frame Header OBU. + memcpy(curr_tg_start, fh_info->frame_header, fh_info->total_length); + + // Force context update tile to be the first tile in error + // resilient mode as the duplicate frame headers will have + // context_update_tile_id set to 0 + *largest_tile_id = 0; + + // Rewrite the OBU header to change the OBU type to Redundant Frame + // Header. + av1_write_obu_header(&cpi->ppi->level_params, &cpi->frame_header_count, + OBU_REDUNDANT_FRAME_HEADER, obu_extn_header, + &curr_tg_start[fh_info->obu_header_byte_offset]); + + *curr_tg_data_size += (int)(fh_info->total_length); + *total_size += (uint32_t)(fh_info->total_length); + } + *is_first_tg = 0; +} + +void av1_reset_pack_bs_thread_data(ThreadData *const td) { + td->coefficient_size = 0; + td->max_mv_magnitude = 0; + av1_zero(td->interp_filter_selected); +} + +void av1_accumulate_pack_bs_thread_data(AV1_COMP *const cpi, + ThreadData const *td) { + int do_max_mv_magnitude_update = 1; + cpi->rc.coefficient_size += td->coefficient_size; + +#if CONFIG_FRAME_PARALLEL_ENCODE + // Disable max_mv_magnitude update for parallel frames based on update flag. 
+ if (!cpi->do_frame_data_update) do_max_mv_magnitude_update = 0; +#endif + + if (cpi->sf.mv_sf.auto_mv_step_size && do_max_mv_magnitude_update) + cpi->mv_search_params.max_mv_magnitude = + AOMMAX(cpi->mv_search_params.max_mv_magnitude, td->max_mv_magnitude); + + for (InterpFilter filter = EIGHTTAP_REGULAR; filter < SWITCHABLE; filter++) + cpi->common.cur_frame->interp_filter_selected[filter] += + td->interp_filter_selected[filter]; +} + // Store information related to each default tile in the OBU header. static void write_tile_obu( AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size, - struct aom_write_bit_buffer *saved_wb, uint8_t obu_extension_header, + struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header, const FrameHeaderInfo *fh_info, int *const largest_tile_id, unsigned int *max_tile_size, uint32_t *const obu_header_size, uint8_t **tile_data_start) { AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; const CommonTileParams *const tiles = &cm->tiles; - AV1LevelParams *const level_params = &cpi->level_params; - TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS]; const int tile_cols = tiles->cols; const int tile_rows = tiles->rows; - unsigned int tile_size = 0; // Fixed size tile groups for the moment const int num_tg_hdrs = cpi->num_tg; const int tg_size = (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs; int tile_count = 0; - int curr_tg_data_size = 0; - uint8_t *data = dst; + size_t curr_tg_data_size = 0; + uint8_t *tile_data_curr = dst; int new_tg = 1; - int first_tg = 1; + int is_first_tg = 1; + av1_reset_pack_bs_thread_data(&cpi->td); for (int tile_row = 0; tile_row < tile_rows; tile_row++) { for (int tile_col = 0; tile_col < tile_cols; tile_col++) { - aom_writer mode_bc; const int tile_idx = tile_row * tile_cols + tile_col; - TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; - int is_last_tile_in_tg = 0; + int is_last_tile_in_tg 
= 0; if (new_tg) { - data = dst + *total_size; - - // A new tile group begins at this tile. Write the obu header and - // tile group header - const OBU_TYPE obu_type = - (num_tg_hdrs == 1) ? OBU_FRAME : OBU_TILE_GROUP; - curr_tg_data_size = av1_write_obu_header(level_params, obu_type, - obu_extension_header, data); - *obu_header_size = curr_tg_data_size; - - if (num_tg_hdrs == 1) - curr_tg_data_size += write_frame_header_obu( - cpi, saved_wb, data + curr_tg_data_size, 0); - curr_tg_data_size += write_tile_group_header( - data + curr_tg_data_size, tile_idx, - AOMMIN(tile_idx + tg_size - 1, tile_cols * tile_rows - 1), - (tiles->log2_rows + tiles->log2_cols), cpi->num_tg > 1); - *total_size += curr_tg_data_size; - *tile_data_start += curr_tg_data_size; - new_tg = 0; + tile_data_curr = dst + *total_size; tile_count = 0; } tile_count++; - TileInfo tile_info; - av1_tile_set_col(&tile_info, cm, tile_col); - av1_tile_set_row(&tile_info, cm, tile_row); - if (tile_count == tg_size || tile_idx == (tile_cols * tile_rows - 1)) { + if (tile_count == tg_size || tile_idx == (tile_cols * tile_rows - 1)) is_last_tile_in_tg = 1; - new_tg = 1; - } else { - is_last_tile_in_tg = 0; - } - buf->data = dst + *total_size; + xd->tile_ctx = &this_tile->tctx; - // The last tile of the tile group does not have a header. - if (!is_last_tile_in_tg) *total_size += 4; + // PackBSParams stores all parameters required to pack tile and header + // info. 
+ PackBSParams pack_bs_params; + pack_bs_params.dst = dst; + pack_bs_params.curr_tg_hdr_size = 0; + pack_bs_params.is_last_tile_in_tg = is_last_tile_in_tg; + pack_bs_params.new_tg = new_tg; + pack_bs_params.obu_extn_header = obu_extn_header; + pack_bs_params.obu_header_size = 0; + pack_bs_params.saved_wb = saved_wb; + pack_bs_params.tile_col = tile_col; + pack_bs_params.tile_row = tile_row; + pack_bs_params.tile_data_curr = tile_data_curr; + pack_bs_params.total_size = total_size; - cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; - mode_bc.allow_update_cdf = 1; - mode_bc.allow_update_cdf = - mode_bc.allow_update_cdf && !cm->features.disable_cdf_update; - const int num_planes = av1_num_planes(cm); - av1_reset_loop_restoration(&cpi->td.mb.e_mbd, num_planes); + if (new_tg) + av1_write_obu_tg_tile_headers(cpi, xd, &pack_bs_params, tile_idx); - aom_start_encode(&mode_bc, dst + *total_size); - write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col); - aom_stop_encode(&mode_bc); - tile_size = mode_bc.pos; - assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES); + av1_pack_tile_info(cpi, &cpi->td, &pack_bs_params); - curr_tg_data_size += (tile_size + (is_last_tile_in_tg ? 
0 : 4)); - buf->size = tile_size; - if (tile_size > *max_tile_size) { - *largest_tile_id = tile_cols * tile_row + tile_col; - *max_tile_size = tile_size; + if (new_tg) { + curr_tg_data_size = pack_bs_params.curr_tg_hdr_size; + *tile_data_start += pack_bs_params.curr_tg_hdr_size; + *obu_header_size = pack_bs_params.obu_header_size; + new_tg = 0; } + if (is_last_tile_in_tg) new_tg = 1; - if (!is_last_tile_in_tg) { - // size of this tile - mem_put_le32(buf->data, tile_size - AV1_MIN_TILE_SIZE_BYTES); - } else { - // write current tile group size - const uint32_t obu_payload_size = curr_tg_data_size - *obu_header_size; - const size_t length_field_size = - obu_memmove(*obu_header_size, obu_payload_size, data); - if (av1_write_uleb_obu_size(*obu_header_size, obu_payload_size, data) != - AOM_CODEC_OK) { - assert(0); - } - curr_tg_data_size += (int)length_field_size; - *total_size += (uint32_t)length_field_size; - *tile_data_start += length_field_size; - if (num_tg_hdrs == 1) { - // if this tg is combined with the frame header then update saved - // frame header base offset accroding to length field size - saved_wb->bit_buffer += length_field_size; - } + curr_tg_data_size += + (pack_bs_params.buf.size + (is_last_tile_in_tg ? 0 : 4)); - if (!first_tg && cm->features.error_resilient_mode) { - // Make room for a duplicate Frame Header OBU. - memmove(data + fh_info->total_length, data, curr_tg_data_size); - - // Insert a copy of the Frame Header OBU. - memcpy(data, fh_info->frame_header, fh_info->total_length); - - // Force context update tile to be the first tile in error - // resiliant mode as the duplicate frame headers will have - // context_update_tile_id set to 0 - *largest_tile_id = 0; - - // Rewrite the OBU header to change the OBU type to Redundant Frame - // Header. 
- av1_write_obu_header(level_params, OBU_REDUNDANT_FRAME_HEADER, - obu_extension_header, - &data[fh_info->obu_header_byte_offset]); - - data += fh_info->total_length; - - curr_tg_data_size += (int)(fh_info->total_length); - *total_size += (uint32_t)(fh_info->total_length); - } - first_tg = 0; + if (pack_bs_params.buf.size > *max_tile_size) { + *largest_tile_id = tile_idx; + *max_tile_size = (unsigned int)pack_bs_params.buf.size; } - *total_size += tile_size; + if (is_last_tile_in_tg) + av1_write_last_tile_info(cpi, fh_info, saved_wb, &curr_tg_data_size, + tile_data_curr, total_size, tile_data_start, + largest_tile_id, &is_first_tg, + *obu_header_size, obu_extn_header); + *total_size += (uint32_t)pack_bs_params.buf.size; } } + av1_accumulate_pack_bs_thread_data(cpi, &cpi->td); } // Write total buffer size and related information into the OBU header for @@ -3854,6 +3915,24 @@ static void write_tile_obu_size(AV1_COMP *const cpi, uint8_t *const dst, } } +// As per the experiments, single-thread bitstream packing is better for +// frames with a smaller bitstream size. This behavior is due to setup time +// overhead of multithread function would be more than that of time required +// to pack the smaller bitstream of such frames. We set a threshold on the +// total absolute sum of transform coeffs to detect such frames and disable +// Multithreading. 
+int enable_pack_bitstream_mt(const TileDataEnc *tile_data, int num_tiles, + int num_workers) { + if (AOMMIN(num_workers, num_tiles) <= 1) return 0; + + const int num_work_sqr = num_workers * num_workers; + const uint64_t thresh = 50; + uint64_t frame_abs_sum_level = 0; + for (int idx = 0; idx < num_tiles; idx++) + frame_abs_sum_level += tile_data[idx].abs_sum_level; + return ((frame_abs_sum_level > (num_work_sqr * thresh) / (num_workers - 1))); +} + static INLINE uint32_t pack_tiles_in_tg_obus( AV1_COMP *const cpi, uint8_t *const dst, struct aom_write_bit_buffer *saved_wb, uint8_t obu_extension_header, @@ -3863,16 +3942,25 @@ static INLINE uint32_t pack_tiles_in_tg_obus( unsigned int max_tile_size = 0; uint32_t obu_header_size = 0; uint8_t *tile_data_start = dst; - - write_tile_obu(cpi, dst, &total_size, saved_wb, obu_extension_header, fh_info, - largest_tile_id, &max_tile_size, &obu_header_size, - &tile_data_start); - + const int num_workers = cpi->mt_info.num_mod_workers[MOD_PACK_BS]; const int tile_cols = tiles->cols; const int tile_rows = tiles->rows; - const int have_tiles = tile_cols * tile_rows > 1; + const int num_tiles = tile_rows * tile_cols; + + const int enable_mt = + enable_pack_bitstream_mt(cpi->tile_data, num_tiles, num_workers); - if (have_tiles) + if (enable_mt) { + av1_write_tile_obu_mt(cpi, dst, &total_size, saved_wb, obu_extension_header, + fh_info, largest_tile_id, &max_tile_size, + &obu_header_size, &tile_data_start); + } else { + write_tile_obu(cpi, dst, &total_size, saved_wb, obu_extension_header, + fh_info, largest_tile_id, &max_tile_size, &obu_header_size, + &tile_data_start); + } + + if (num_tiles > 1) write_tile_obu_size(cpi, dst, saved_wb, *largest_tile_id, &total_size, max_tile_size, obu_header_size, tile_data_start); return total_size; @@ -3887,6 +3975,9 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, const CommonTileParams *const tiles = &cm->tiles; *largest_tile_id = 0; + // Select the coding 
strategy (temporal or spatial) + if (cm->seg.enabled) av1_choose_segmap_coding_method(cm, &cpi->td.mb.e_mbd); + if (tiles->large_scale) return pack_large_scale_tiles_in_tg_obus(cpi, dst, saved_wb, largest_tile_id); @@ -3926,18 +4017,20 @@ static size_t av1_write_metadata_array(AV1_COMP *const cpi, uint8_t *dst) { (cm->current_frame.frame_type != KEY_FRAME && current_metadata->insert_flag == AOM_MIF_NON_KEY_FRAME) || current_metadata->insert_flag == AOM_MIF_ANY_FRAME) { - obu_header_size = - av1_write_obu_header(&cpi->level_params, OBU_METADATA, 0, dst); + obu_header_size = av1_write_obu_header(&cpi->ppi->level_params, + &cpi->frame_header_count, + OBU_METADATA, 0, dst); obu_payload_size = av1_write_metadata_obu(current_metadata, dst + obu_header_size); - length_field_size = obu_memmove(obu_header_size, obu_payload_size, dst); + length_field_size = + av1_obu_memmove(obu_header_size, obu_payload_size, dst); if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, dst) == AOM_CODEC_OK) { const size_t obu_size = obu_header_size + obu_payload_size; dst += obu_size + length_field_size; total_bytes_written += obu_size + length_field_size; } else { - aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, + aom_internal_error(cpi->common.error, AOM_CODEC_ERROR, "Error writing metadata OBU size"); } } @@ -3951,7 +4044,7 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size, uint8_t *data = dst; uint32_t data_size; AV1_COMMON *const cm = &cpi->common; - AV1LevelParams *const level_params = &cpi->level_params; + AV1LevelParams *const level_params = &cpi->ppi->level_params; uint32_t obu_header_size = 0; uint32_t obu_payload_size = 0; FrameHeaderInfo fh_info = { NULL, 0, 0 }; @@ -3967,19 +4060,19 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size, bitstream_queue_reset_write(); #endif - level_params->frame_header_count = 0; + cpi->frame_header_count = 0; // The TD is now written outside the frame encode loop // write sequence 
header obu if KEY_FRAME, preceded by 4-byte size if (cm->current_frame.frame_type == KEY_FRAME && !cpi->no_show_fwd_kf) { - obu_header_size = - av1_write_obu_header(level_params, OBU_SEQUENCE_HEADER, 0, data); + obu_header_size = av1_write_obu_header( + level_params, &cpi->frame_header_count, OBU_SEQUENCE_HEADER, 0, data); obu_payload_size = - av1_write_sequence_header_obu(&cm->seq_params, data + obu_header_size); + av1_write_sequence_header_obu(cm->seq_params, data + obu_header_size); const size_t length_field_size = - obu_memmove(obu_header_size, obu_payload_size, data); + av1_obu_memmove(obu_header_size, obu_payload_size, data); if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) != AOM_CODEC_OK) { return AOM_CODEC_ERROR; @@ -3998,12 +4091,13 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size, if (write_frame_header) { // Write Frame Header OBU. fh_info.frame_header = data; - obu_header_size = av1_write_obu_header(level_params, OBU_FRAME_HEADER, - obu_extension_header, data); - obu_payload_size = - write_frame_header_obu(cpi, &saved_wb, data + obu_header_size, 1); + obu_header_size = + av1_write_obu_header(level_params, &cpi->frame_header_count, + OBU_FRAME_HEADER, obu_extension_header, data); + obu_payload_size = write_frame_header_obu(cpi, &cpi->td.mb.e_mbd, &saved_wb, + data + obu_header_size, 1); - length_field = obu_memmove(obu_header_size, obu_payload_size, data); + length_field = av1_obu_memmove(obu_header_size, obu_payload_size, data); if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) != AOM_CODEC_OK) { return AOM_CODEC_ERROR; diff --git a/third_party/libaom/source/libaom/av1/encoder/bitstream.h b/third_party/libaom/source/libaom/av1/encoder/bitstream.h index df35ecccfa..e32cd3bd19 100644 --- a/third_party/libaom/source/libaom/av1/encoder/bitstream.h +++ b/third_party/libaom/source/libaom/av1/encoder/bitstream.h @@ -16,9 +16,67 @@ extern "C" { #endif -#include "av1/encoder/encoder.h" +#include 
"av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/enums.h" +#include "av1/encoder/level.h" +#include "aom_dsp/bitwriter.h" struct aom_write_bit_buffer; +struct AV1_COMP; +struct ThreadData; + +/*!\cond */ + +// Stores the location and size of a tile's data in the bitstream. Used for +// later identifying identical tiles +typedef struct { + uint8_t *data; + size_t size; +} TileBufferEnc; + +typedef struct { + uint8_t *frame_header; + size_t obu_header_byte_offset; + size_t total_length; +} FrameHeaderInfo; + +typedef struct { + struct aom_write_bit_buffer *saved_wb; // Bit stream buffer writer structure + TileBufferEnc buf; // Structure to hold bitstream buffer and size + uint32_t *total_size; // Size of the bitstream buffer for the tile in bytes + uint8_t *dst; // Base address of tile bitstream buffer + uint8_t *tile_data_curr; // Base address of tile-group bitstream buffer + size_t tile_buf_size; // Available bitstream buffer for the tile in bytes + uint8_t obu_extn_header; // Presence of OBU extension header + uint32_t obu_header_size; // Size of the OBU header + int curr_tg_hdr_size; // Size of the obu, tg, frame headers + int tile_size_mi; // Tile size in mi units + int tile_row; // Number of tile rows + int tile_col; // Number of tile columns + int is_last_tile_in_tg; // Flag to indicate last tile in a tile-group + int new_tg; // Flag to indicate starting of a new tile-group +} PackBSParams; + +typedef struct { + uint64_t abs_sum_level; + uint16_t tile_idx; +} PackBSTileOrder; + +// Pack bitstream data for pack bitstream multi-threading. +typedef struct { +#if CONFIG_MULTITHREAD + // Mutex lock used while dispatching jobs. + pthread_mutex_t *mutex_; +#endif + // Tile order structure of pack bitstream multithreading. + PackBSTileOrder pack_bs_tile_order[MAX_TILES]; + + // Index of next job to be processed. 
+ int next_job_idx; +} AV1EncPackBSSync; + +/*!\endcond */ // Writes only the OBU Sequence Header payload, and returns the size of the // payload written to 'dst'. This function does not write the OBU header, the @@ -29,23 +87,44 @@ uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params, // Writes the OBU header byte, and the OBU header extension byte when // 'obu_extension' is non-zero. Returns number of bytes written to 'dst'. uint32_t av1_write_obu_header(AV1LevelParams *const level_params, - OBU_TYPE obu_type, int obu_extension, - uint8_t *const dst); + int *frame_header_count, OBU_TYPE obu_type, + int obu_extension, uint8_t *const dst); int av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size, uint8_t *dest); +// Pack tile data in the bitstream with tile_group, frame +// and OBU header. +void av1_pack_tile_info(struct AV1_COMP *const cpi, struct ThreadData *const td, + PackBSParams *const pack_bs_params); + +void av1_write_last_tile_info( + struct AV1_COMP *const cpi, const FrameHeaderInfo *fh_info, + struct aom_write_bit_buffer *saved_wb, size_t *curr_tg_data_size, + uint8_t *curr_tg_start, uint32_t *const total_size, + uint8_t **tile_data_start, int *const largest_tile_id, + int *const is_first_tg, uint32_t obu_header_size, uint8_t obu_extn_header); + /*!\brief Pack the bitstream for one frame * * \ingroup high_level_algo * \callgraph */ -int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size, +int av1_pack_bitstream(struct AV1_COMP *const cpi, uint8_t *dst, size_t *size, int *const largest_tile_id); void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, TX_TYPE tx_type, TX_SIZE tx_size, aom_writer *w); +void av1_reset_pack_bs_thread_data(struct ThreadData *const td); + +void av1_accumulate_pack_bs_thread_data(struct AV1_COMP *const cpi, + struct ThreadData const *td); + +void av1_write_obu_tg_tile_headers(struct AV1_COMP *const cpi, + MACROBLOCKD *const xd, + PackBSParams *const 
pack_bs_params, + const int tile_idx); #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/libaom/source/libaom/av1/encoder/block.h b/third_party/libaom/source/libaom/av1/encoder/block.h index 59353cfac3..aaf3654a5f 100644 --- a/third_party/libaom/source/libaom/av1/encoder/block.h +++ b/third_party/libaom/source/libaom/av1/encoder/block.h @@ -102,7 +102,7 @@ typedef struct { */ typedef struct macroblock_plane { //! Stores source - pred so the txfm can be computed later - DECLARE_ALIGNED(32, int16_t, src_diff[MAX_SB_SQUARE]); + int16_t *src_diff; //! Dequantized coefficients tran_low_t *dqcoeff; //! Quantized coefficients @@ -778,6 +778,23 @@ typedef struct { /**@}*/ } MvCosts; +/*! \brief Holds mv costs for intrabc. + */ +typedef struct { + /*! Costs for coding the joint mv. */ + int joint_mv[MV_JOINTS]; + + /*! \brief Cost of transmitting the actual motion vector. + * dv_costs_alloc[0][i] is the cost of motion vector with horizontal + * component (mv_row) equal to i - MV_MAX. dv_costs_alloc[1][i] is the cost of + * motion vector with vertical component (mv_col) equal to i - MV_MAX. + */ + int dv_costs_alloc[2][MV_VALS]; + + /*! Points to the middle of \ref dv_costs_alloc. */ + int *dv_costs[2]; +} IntraBCMVCosts; + /*! \brief Holds the costs needed to encode the coefficients */ typedef struct { @@ -817,6 +834,14 @@ typedef struct { int lighting_change; int low_sumdiff; } CONTENT_STATE_SB; + +// Structure to hold pixel level gradient info. +typedef struct { + uint16_t abs_dx_abs_dy_sum; + int8_t hist_bin_idx; + bool is_dx_zero; +} PixelLevelGradientInfo; + /*!\endcond */ /*! \brief Encoder's parameters related to the current coding block. @@ -945,6 +970,11 @@ typedef struct macroblock { //! multipliers for motion search. MvCosts *mv_costs; + /*! The rate needed to encode a new motion vector to the bitstream in intrabc + * mode. + */ + IntraBCMVCosts *dv_costs; + //! The rate needed to signal the txfm coefficients to the bitstream. 
CoeffCosts coeff_costs; /**@}*/ @@ -1014,6 +1044,10 @@ typedef struct macroblock { int pred_mv_sad[REF_FRAMES]; //! The minimum of \ref pred_mv_sad. int best_pred_mv_sad; + //! The sad of the 1st mv ref (nearest). + int pred_mv0_sad[REF_FRAMES]; + //! The sad of the 2nd mv ref (near). + int pred_mv1_sad[REF_FRAMES]; /*! \brief Disables certain ref frame pruning based on tpl. * @@ -1092,8 +1126,7 @@ typedef struct macroblock { * In the second pass, we retry the winner modes with more thorough txfm * options. */ - WinnerModeStats winner_mode_stats[AOMMAX(MAX_WINNER_MODE_COUNT_INTRA, - MAX_WINNER_MODE_COUNT_INTER)]; + WinnerModeStats *winner_mode_stats; //! Tracks how many winner modes there are. int winner_mode_count; @@ -1147,10 +1180,20 @@ typedef struct macroblock { */ IntraBCHashInfo intrabc_hash_info; - /*! \brief Whether to reuse the mode stored in intermode_cache. */ - int use_intermode_cache; - /*! \brief The mode to reuse during \ref av1_rd_pick_inter_mode. */ - const MB_MODE_INFO *intermode_cache; + /*! \brief Whether to reuse the mode stored in mb_mode_cache. */ + int use_mb_mode_cache; + /*! \brief The mode to reuse during \ref av1_rd_pick_intra_mode_sb and + * \ref av1_rd_pick_inter_mode. */ + const MB_MODE_INFO *mb_mode_cache; + /*! \brief Pointer to the buffer which caches gradient information. + * + * Pointer to the array of structures to store gradient information of each + * pixel in a superblock. The buffer constitutes of MAX_SB_SQUARE pixel level + * structures for each of the plane types (PLANE_TYPE_Y and PLANE_TYPE_UV). + */ + PixelLevelGradientInfo *pixel_gradient_info; + /*! \brief Flags indicating the availability of cached gradient info. */ + bool is_sb_gradient_cached[PLANE_TYPES]; /**@}*/ /***************************************************************************** @@ -1195,6 +1238,8 @@ typedef struct macroblock { * Used in REALTIME coding mode to enhance the visual quality at the boundary * of moving color objects. 
*/ + uint8_t color_sensitivity_sb[2]; + //! Color sensitivity flag for the coding block. uint8_t color_sensitivity[2]; /**@}*/ diff --git a/third_party/libaom/source/libaom/av1/encoder/compound_type.c b/third_party/libaom/source/libaom/av1/encoder/compound_type.c index aacb7fc88a..00fa3890bf 100644 --- a/third_party/libaom/source/libaom/av1/encoder/compound_type.c +++ b/third_party/libaom/source/libaom/av1/encoder/compound_type.c @@ -48,31 +48,31 @@ static INLINE int is_comp_rd_match(const AV1_COMP *const cpi, if (is_global_mv_block(mi, wm->wmtype) != st->is_global[i]) return 0; } - // Store the stats for COMPOUND_AVERAGE and COMPOUND_DISTWTD - for (int comp_type = COMPOUND_AVERAGE; comp_type <= COMPOUND_DISTWTD; - comp_type++) { - comp_rate[comp_type] = st->rate[comp_type]; - comp_dist[comp_type] = st->dist[comp_type]; - comp_model_rate[comp_type] = st->model_rate[comp_type]; - comp_model_dist[comp_type] = st->model_dist[comp_type]; - comp_rs2[comp_type] = st->comp_rs2[comp_type]; - } - - // For compound wedge/segment, reuse data only if NEWMV is not present in - // either of the directions + int reuse_data[COMPOUND_TYPES] = { 1, 1, 0, 0 }; + // For compound wedge, reuse data if newmv search is disabled when NEWMV is + // present or if NEWMV is not present in either of the directions if ((!have_newmv_in_inter_mode(mi->mode) && !have_newmv_in_inter_mode(st->mode)) || - (cpi->sf.inter_sf.disable_interinter_wedge_newmv_search)) { - memcpy(&comp_rate[COMPOUND_WEDGE], &st->rate[COMPOUND_WEDGE], - sizeof(comp_rate[COMPOUND_WEDGE]) * 2); - memcpy(&comp_dist[COMPOUND_WEDGE], &st->dist[COMPOUND_WEDGE], - sizeof(comp_dist[COMPOUND_WEDGE]) * 2); - memcpy(&comp_model_rate[COMPOUND_WEDGE], &st->model_rate[COMPOUND_WEDGE], - sizeof(comp_model_rate[COMPOUND_WEDGE]) * 2); - memcpy(&comp_model_dist[COMPOUND_WEDGE], &st->model_dist[COMPOUND_WEDGE], - sizeof(comp_model_dist[COMPOUND_WEDGE]) * 2); - memcpy(&comp_rs2[COMPOUND_WEDGE], &st->comp_rs2[COMPOUND_WEDGE], - 
sizeof(comp_rs2[COMPOUND_WEDGE]) * 2); + (cpi->sf.inter_sf.disable_interinter_wedge_newmv_search)) + reuse_data[COMPOUND_WEDGE] = 1; + // For compound diffwtd, reuse data if fast search is enabled (no newmv search + // when NEWMV is present) or if NEWMV is not present in either of the + // directions + if (cpi->sf.inter_sf.enable_fast_compound_mode_search || + (!have_newmv_in_inter_mode(mi->mode) && + !have_newmv_in_inter_mode(st->mode))) + reuse_data[COMPOUND_DIFFWTD] = 1; + + // Store the stats for the different compound types + for (int comp_type = COMPOUND_AVERAGE; comp_type < COMPOUND_TYPES; + comp_type++) { + if (reuse_data[comp_type]) { + comp_rate[comp_type] = st->rate[comp_type]; + comp_dist[comp_type] = st->dist[comp_type]; + comp_model_rate[comp_type] = st->model_rate[comp_type]; + comp_model_dist[comp_type] = st->model_dist[comp_type]; + comp_rs2[comp_type] = st->comp_rs2[comp_type]; + } } return 1; } @@ -166,14 +166,14 @@ static int8_t estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x, // TODO(nithya): Sign estimation assumes 45 degrees (1st and 4th quadrants) // for all codebooks; experiment with other quadrant combinations for // 0, 90 and 135 degrees also. 
- cpi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]); - cpi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride, - pred0 + bh_by2 * stride0 + bw_by2, stride0, - &esq[0][1]); - cpi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]); - cpi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride, - pred1 + bh_by2 * stride1 + bw_by2, stride0, - &esq[1][1]); + cpi->ppi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]); + cpi->ppi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride, + pred0 + bh_by2 * stride0 + bw_by2, stride0, + &esq[0][1]); + cpi->ppi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]); + cpi->ppi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride, + pred1 + bh_by2 * stride1 + bw_by2, stride0, + &esq[1][1]); tl = ((int64_t)esq[0][0]) - ((int64_t)esq[1][0]); br = ((int64_t)esq[1][1]) - ((int64_t)esq[0][1]); @@ -314,7 +314,7 @@ static int64_t pick_interinter_wedge( int8_t wedge_sign = 0; assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize)); - assert(cpi->common.seq_params.enable_masked_compound); + assert(cpi->common.seq_params->enable_masked_compound); if (cpi->sf.inter_sf.fast_wedge_sign_estimate) { wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw); @@ -392,7 +392,7 @@ static int64_t pick_interintra_wedge(const AV1_COMP *const cpi, const MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; assert(av1_is_wedge_used(bsize)); - assert(cpi->common.seq_params.enable_interintra_compound); + assert(cpi->common.seq_params->enable_interintra_compound); const struct buf_2d *const src = &x->plane[0].src; const int bw = block_size_wide[bsize]; @@ -836,7 +836,7 @@ static INLINE int compute_valid_comp_types(MACROBLOCK *x, const int try_average_comp = (mode_search_mask & (1 << COMPOUND_AVERAGE)); const int try_distwtd_comp = ((mode_search_mask & (1 << COMPOUND_DISTWTD)) && - 
cm->seq_params.order_hint_info.enable_dist_wtd_comp == 1 && + cm->seq_params->order_hint_info.enable_dist_wtd_comp == 1 && cpi->sf.inter_sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED); // Check if COMPOUND_AVERAGE and COMPOUND_DISTWTD are valid cases @@ -1058,10 +1058,12 @@ static int64_t masked_compound_type_rd( if (compound_type == COMPOUND_WEDGE) { unsigned int sse; if (is_cur_buf_hbd(xd)) - (void)cpi->fn_ptr[bsize].vf(CONVERT_TO_BYTEPTR(*preds0), *strides, - CONVERT_TO_BYTEPTR(*preds1), *strides, &sse); + (void)cpi->ppi->fn_ptr[bsize].vf(CONVERT_TO_BYTEPTR(*preds0), *strides, + CONVERT_TO_BYTEPTR(*preds1), *strides, + &sse); else - (void)cpi->fn_ptr[bsize].vf(*preds0, *strides, *preds1, *strides, &sse); + (void)cpi->ppi->fn_ptr[bsize].vf(*preds0, *strides, *preds1, *strides, + &sse); const unsigned int mse = ROUND_POWER_OF_TWO(sse, num_pels_log2_lookup[bsize]); // If two predictors are very similar, skip wedge compound mode search @@ -1164,7 +1166,8 @@ static int64_t masked_compound_type_rd( assert(comp_dist[compound_type] != INT64_MAX); // When disable_interinter_wedge_newmv_search is set, motion refinement is // disabled. 
Hence rate and distortion can be reused in this case as well - assert(IMPLIES(have_newmv_in_inter_mode(this_mode), + assert(IMPLIES((have_newmv_in_inter_mode(this_mode) && + (compound_type == COMPOUND_WEDGE)), cpi->sf.inter_sf.disable_interinter_wedge_newmv_search)); assert(mbmi->mv[0].as_int == cur_mv[0].as_int); assert(mbmi->mv[1].as_int == cur_mv[1].as_int); @@ -1338,11 +1341,12 @@ int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, if (have_newmv_in_inter_mode(this_mode)) { InterPredParams inter_pred_params; av1_dist_wtd_comp_weight_assign( - &cpi->common, mbmi, 0, &inter_pred_params.conv_params.fwd_offset, + &cpi->common, mbmi, &inter_pred_params.conv_params.fwd_offset, &inter_pred_params.conv_params.bck_offset, &inter_pred_params.conv_params.use_dist_wtd_comp_avg, 1); int mask_value = inter_pred_params.conv_params.fwd_offset * 4; - memset(xd->seg_mask, mask_value, sizeof(xd->seg_mask)); + memset(xd->seg_mask, mask_value, + sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE); tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv, bsize, this_mode); } @@ -1369,7 +1373,7 @@ int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, int_mv tmp_mv[2] = { mbmi->mv[0], mbmi->mv[1] }; int best_rs2 = 0; int best_rate_mv = *rate_mv; - const int wedge_mask_size = get_wedge_types_lookup(bsize); + int wedge_mask_size = get_wedge_types_lookup(bsize); int need_mask_search = args->wedge_index == -1; if (need_mask_search && !have_newmv_in_inter_mode(this_mode)) { @@ -1392,7 +1396,8 @@ int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0); if (mode_rd >= ref_best_rd / 2) continue; - if (have_newmv_in_inter_mode(this_mode)) { + if (have_newmv_in_inter_mode(this_mode) && + !cpi->sf.inter_sf.disable_interinter_wedge_newmv_search) { tmp_rate_mv = av1_interinter_compound_motion_search( cpi, x, cur_mv, bsize, this_mode); av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, @@ 
-1425,6 +1430,33 @@ int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, best_rs2 = rs2; } } + // Consider the asymmetric partitions for oblique angle only if the + // corresponding symmetric partition is the best so far. + // Note: For horizontal and vertical types, both symmetric and + // asymmetric partitions are always considered. + if (cpi->sf.inter_sf.enable_fast_wedge_mask_search) { + // The first 4 entries in wedge_codebook_16_heqw/hltw/hgtw[16] + // correspond to symmetric partitions of the 4 oblique angles, the + // next 4 entries correspond to the vertical/horizontal + // symmetric/asymmetric partitions and the last 8 entries correspond + // to the asymmetric partitions of oblique types. + const int idx_before_asym_oblique = 7; + const int last_oblique_sym_idx = 3; + if (wedge_mask == idx_before_asym_oblique) { + if (best_mask_index > last_oblique_sym_idx) { + break; + } else { + // Asymmetric (Index-1) map for the corresponding oblique masks. + // WEDGE_OBLIQUE27: sym - 0, asym - 8, 9 + // WEDGE_OBLIQUE63: sym - 1, asym - 12, 13 + // WEDGE_OBLIQUE117: sym - 2, asym - 14, 15 + // WEDGE_OBLIQUE153: sym - 3, asym - 10, 11 + const int asym_mask_idx[4] = { 7, 11, 13, 9 }; + wedge_mask = asym_mask_idx[best_mask_index]; + wedge_mask_size = wedge_mask + 3; + } + } + } } if (need_mask_search) { @@ -1439,7 +1471,8 @@ int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, rs2 = masked_type_cost[cur_type]; rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi); - if (have_newmv_in_inter_mode(this_mode)) { + if (have_newmv_in_inter_mode(this_mode) && + !cpi->sf.inter_sf.disable_interinter_wedge_newmv_search) { tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv, bsize, this_mode); } @@ -1485,7 +1518,8 @@ int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, if (have_newmv_in_inter_mode(this_mode)) { // hard coded number for diff wtd int mask_value = mask_index == 0 ? 
38 : 26; - memset(xd->seg_mask, mask_value, sizeof(xd->seg_mask)); + memset(xd->seg_mask, mask_value, + sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE); tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv, bsize, this_mode); } @@ -1522,7 +1556,8 @@ int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi); int mask_value = mbmi->interinter_comp.mask_type == 0 ? 38 : 26; - memset(xd->seg_mask, mask_value, sizeof(xd->seg_mask)); + memset(xd->seg_mask, mask_value, + sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE); if (have_newmv_in_inter_mode(this_mode)) { tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv, diff --git a/third_party/libaom/source/libaom/av1/encoder/context_tree.c b/third_party/libaom/source/libaom/av1/encoder/context_tree.c index 566576e4f5..9fd9d1b1e8 100644 --- a/third_party/libaom/source/libaom/av1/encoder/context_tree.c +++ b/third_party/libaom/source/libaom/av1/encoder/context_tree.c @@ -230,7 +230,7 @@ static AOM_INLINE int get_pc_tree_nodes(const int is_sb_size_128, void av1_setup_sms_tree(AV1_COMP *const cpi, ThreadData *td) { AV1_COMMON *const cm = &cpi->common; const int stat_generation_stage = is_stat_generation_stage(cpi); - const int is_sb_size_128 = cm->seq_params.sb_size == BLOCK_128X128; + const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128; const int tree_nodes = get_pc_tree_nodes(is_sb_size_128, stat_generation_stage); int sms_tree_index = 0; diff --git a/third_party/libaom/source/libaom/av1/encoder/dwt.c b/third_party/libaom/source/libaom/av1/encoder/dwt.c index b5ed4a3446..5dfbcb677b 100644 --- a/third_party/libaom/source/libaom/av1/encoder/dwt.c +++ b/third_party/libaom/source/libaom/av1/encoder/dwt.c @@ -147,9 +147,23 @@ uint32_t av1_variance(uint8_t *input, int bw, int bh, int stride) { return sse - (uint32_t)(((int64_t)sum * sum) / (bw * bh)); } -int av1_haar_ac_sad_8x8_uint8_input(const uint8_t *input, int stride, 
int hbd) { +static int haar_ac_sad_8x8_uint8_input(const uint8_t *input, int stride, + int hbd) { tran_low_t output[64]; av1_fdwt8x8_uint8_input_c(input, output, stride, hbd); return av1_haar_ac_sad(output, 8, 8, 8); } + +int64_t av1_haar_ac_sad_mxn_uint8_input(const uint8_t *input, int stride, + int hbd, int num_8x8_rows, + int num_8x8_cols) { + int64_t wavelet_energy = 0; + for (int r8 = 0; r8 < num_8x8_rows; ++r8) { + for (int c8 = 0; c8 < num_8x8_cols; ++c8) { + wavelet_energy += haar_ac_sad_8x8_uint8_input( + input + c8 * 8 + r8 * 8 * stride, stride, hbd); + } + } + return wavelet_energy; +} diff --git a/third_party/libaom/source/libaom/av1/encoder/dwt.h b/third_party/libaom/source/libaom/av1/encoder/dwt.h index 1bd32edb3b..443b6bc12c 100644 --- a/third_party/libaom/source/libaom/av1/encoder/dwt.h +++ b/third_party/libaom/source/libaom/av1/encoder/dwt.h @@ -19,6 +19,9 @@ void av1_fdwt8x8_uint8_input_c(const uint8_t *input, tran_low_t *output, int stride, int hbd); -int av1_haar_ac_sad_8x8_uint8_input(const uint8_t *input, int stride, int hbd); + +int64_t av1_haar_ac_sad_mxn_uint8_input(const uint8_t *input, int stride, + int hbd, int num_8x8_rows, + int num_8x8_cols); #endif // AOM_AV1_ENCODER_DWT_H_ diff --git a/third_party/libaom/source/libaom/av1/encoder/enc_enums.h b/third_party/libaom/source/libaom/av1/encoder/enc_enums.h index 319e5d02c9..20cefa16a5 100644 --- a/third_party/libaom/source/libaom/av1/encoder/enc_enums.h +++ b/third_party/libaom/source/libaom/av1/encoder/enc_enums.h @@ -216,6 +216,8 @@ enum { NUM_SINGLE_REF_MODES = SINGLE_REF_MODE_END - SINGLE_REF_MODE_START, THR_MODE_START = THR_NEARESTMV, THR_MODE_END = MAX_MODES, + THR_INTER_MODE_START = THR_MODE_START, + THR_INTER_MODE_END = THR_DC, THR_INVALID = 255 } UENUM1BYTE(THR_MODES); diff --git a/third_party/libaom/source/libaom/av1/encoder/encode_strategy.c b/third_party/libaom/source/libaom/av1/encoder/encode_strategy.c index da7ec4487d..01f2959d85 100644 --- 
a/third_party/libaom/source/libaom/av1/encoder/encode_strategy.c +++ b/third_party/libaom/source/libaom/av1/encoder/encode_strategy.c @@ -106,11 +106,19 @@ void av1_configure_buffer_updates( } if (ext_refresh_frame_flags->update_pending && - (!is_stat_generation_stage(cpi))) + (!is_stat_generation_stage(cpi))) { set_refresh_frame_flags(refresh_frame_flags, ext_refresh_frame_flags->golden_frame, ext_refresh_frame_flags->bwd_ref_frame, ext_refresh_frame_flags->alt_ref_frame); + GF_GROUP *gf_group = &cpi->ppi->gf_group; + if (ext_refresh_frame_flags->golden_frame) + gf_group->update_type[cpi->gf_frame_index] = GF_UPDATE; + if (ext_refresh_frame_flags->alt_ref_frame) + gf_group->update_type[cpi->gf_frame_index] = ARF_UPDATE; + if (ext_refresh_frame_flags->bwd_ref_frame) + gf_group->update_type[cpi->gf_frame_index] = INTNL_ARF_UPDATE; + } if (force_refresh_all) set_refresh_frame_flags(refresh_frame_flags, true, true, true); @@ -141,7 +149,7 @@ static INLINE int is_frame_droppable( const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags) { // Droppable frame is only used by external refresh flags. VoD setting won't // trigger its use case. - if (svc->external_ref_frame_config) + if (svc->set_ref_frame_config) return svc->non_reference_frame; else if (ext_refresh_frame_flags->update_pending) return !(ext_refresh_frame_flags->alt_ref_frame || @@ -168,7 +176,7 @@ static INLINE void update_frames_till_gf_update(AV1_COMP *cpi) { static INLINE void update_gf_group_index(AV1_COMP *cpi) { // Increment the gf group index ready for the next frame. - ++cpi->gf_group.index; + ++cpi->gf_frame_index; } static void update_rc_counts(AV1_COMP *cpi) { @@ -216,7 +224,7 @@ static int get_current_frame_ref_type( // TODO(jingning): This table should be a lot simpler with the new // ARF system in place. Keep frame_params for the time being as we are // still evaluating a few design options. 
- switch (cpi->gf_group.layer_depth[cpi->gf_group.index]) { + switch (cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index]) { case 0: return 0; case 1: return 1; case MAX_ARF_LAYERS: @@ -238,16 +246,16 @@ static int choose_primary_ref_frame( // In large scale case, always use Last frame's frame contexts. // Note(yunqing): In other cases, primary_ref_frame is chosen based on - // cpi->gf_group.layer_depth[cpi->gf_group.index], which also controls + // cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index], which also controls // frame bit allocation. if (cm->tiles.large_scale) return (LAST_FRAME - LAST_FRAME); - if (cpi->use_svc) return av1_svc_primary_ref_frame(cpi); + if (cpi->ppi->use_svc) return av1_svc_primary_ref_frame(cpi); // Find the most recent reference frame with the same reference type as the // current frame const int current_ref_type = get_current_frame_ref_type(cpi, frame_params); - int wanted_fb = cpi->fb_of_context_type[current_ref_type]; + int wanted_fb = cpi->ppi->fb_of_context_type[current_ref_type]; int primary_ref_frame = PRIMARY_REF_NONE; for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { @@ -303,7 +311,7 @@ static void adjust_frame_rate(AV1_COMP *cpi, int64_t ts_start, int64_t ts_end) { // Clear down mmx registers aom_clear_system_state(); - if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) { + if (cpi->ppi->use_svc && cpi->svc.spatial_layer_id > 0) { cpi->framerate = cpi->svc.base_framerate; av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height); return; @@ -372,17 +380,17 @@ static struct lookahead_entry *choose_frame_source( struct lookahead_entry **last_source, EncodeFrameParams *const frame_params) { AV1_COMMON *const cm = &cpi->common; - const GF_GROUP *const gf_group = &cpi->gf_group; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; struct lookahead_entry *source = NULL; // Source index in lookahead buffer. 
- int src_index = gf_group->arf_src_offset[gf_group->index]; + int src_index = gf_group->arf_src_offset[cpi->gf_frame_index]; // TODO(Aasaipriya): Forced key frames need to be fixed when rc_mode != AOM_Q if (src_index && (is_forced_keyframe_pending(cpi->ppi->lookahead, src_index, cpi->compressor_stage) != -1) && - cpi->oxcf.rc_cfg.mode != AOM_Q) { + cpi->oxcf.rc_cfg.mode != AOM_Q && !is_stat_generation_stage(cpi)) { src_index = 0; *flush = 1; } @@ -395,7 +403,7 @@ static struct lookahead_entry *choose_frame_source( // If this is a key frame and keyframe filtering is enabled with overlay, // then do not pop. if (*pop_lookahead && cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1 && - gf_group->update_type[gf_group->index] == ARF_UPDATE && + gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE && !is_stat_generation_stage(cpi) && cpi->ppi->lookahead) { if (cpi->ppi->lookahead->read_ctxs[cpi->compressor_stage].sz && (*flush || @@ -404,16 +412,37 @@ static struct lookahead_entry *choose_frame_source( *pop_lookahead = 0; } } + + // LAP stage does not have ARFs or forward key-frames, + // hence, always pop_lookahead here. + if (is_stat_generation_stage(cpi)) { + *pop_lookahead = 1; + src_index = 0; + } + frame_params->show_frame = *pop_lookahead; - if (*pop_lookahead) { + +#if CONFIG_FRAME_PARALLEL_ENCODE + // Future frame in parallel encode set + if (gf_group->src_offset[cpi->gf_frame_index] != 0 && + !is_stat_generation_stage(cpi) && + 0 /*will be turned on along with frame parallel encode*/) { + src_index = gf_group->src_offset[cpi->gf_frame_index]; + // Don't remove future frames from lookahead_ctx. They will be + // removed in their actual encode call. + *pop_lookahead = 0; + } +#endif + if (frame_params->show_frame) { // show frame, pop from buffer // Get last frame source. 
if (cm->current_frame.frame_number > 0) { - *last_source = - av1_lookahead_peek(cpi->ppi->lookahead, -1, cpi->compressor_stage); + *last_source = av1_lookahead_peek(cpi->ppi->lookahead, src_index - 1, + cpi->compressor_stage); } // Read in the source frame. - source = av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage); + source = av1_lookahead_peek(cpi->ppi->lookahead, src_index, + cpi->compressor_stage); } else { // no show frames are arf frames source = av1_lookahead_peek(cpi->ppi->lookahead, src_index, @@ -677,7 +706,17 @@ void av1_update_ref_frame_map(AV1_COMP *cpi, return; } -static int get_free_ref_map_index(const RefBufferStack *ref_buffer_stack) { +static int get_free_ref_map_index( +#if CONFIG_FRAME_PARALLEL_ENCODE + RefFrameMapPair ref_map_pairs[REF_FRAMES], +#endif // CONFIG_FRAME_PARALLEL_ENCODE + const RefBufferStack *ref_buffer_stack) { +#if CONFIG_FRAME_PARALLEL_ENCODE + (void)ref_buffer_stack; + for (int idx = 0; idx < REF_FRAMES; ++idx) + if (ref_map_pairs[idx].disp_order == -1) return idx; + return INVALID_IDX; +#else for (int idx = 0; idx < REF_FRAMES; ++idx) { int is_free = 1; for (int i = 0; i < ref_buffer_stack->arf_stack_size; ++i) { @@ -704,11 +743,61 @@ static int get_free_ref_map_index(const RefBufferStack *ref_buffer_stack) { if (is_free) return idx; } return INVALID_IDX; +#endif // CONFIG_FRAME_PARALLEL_ENCODE } +#if CONFIG_FRAME_PARALLEL_ENCODE +static int get_refresh_idx(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], + int update_arf, int cur_frame_disp) { + int arf_count = 0; + int oldest_arf_order = INT32_MAX; + int oldest_arf_idx = -1; + + int oldest_frame_order = INT32_MAX; + int oldest_idx = -1; + + for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) { + RefFrameMapPair ref_pair = ref_frame_map_pairs[map_idx]; + if (ref_pair.disp_order == -1) continue; + const int frame_order = ref_pair.disp_order; + const int reference_frame_level = ref_pair.pyr_level; + // Do not refresh a future frame. 
+ if (frame_order > cur_frame_disp) continue; + + // Keep track of the oldest level 1 frame if the current frame is also level + // 1. + if (reference_frame_level == 1) { + // If there are more than 2 level 1 frames in the reference list, + // discard the oldest. + if (frame_order < oldest_arf_order) { + oldest_arf_order = frame_order; + oldest_arf_idx = map_idx; + } + arf_count++; + continue; + } + + // Update the overall oldest reference frame. + if (frame_order < oldest_frame_order) { + oldest_frame_order = frame_order; + oldest_idx = map_idx; + } + } + if (update_arf && arf_count > 2) return oldest_arf_idx; + if (oldest_idx >= 0) return oldest_idx; + if (oldest_arf_idx >= 0) return oldest_arf_idx; + assert(0 && "No valid refresh index found"); + return -1; +} +#endif // CONFIG_FRAME_PARALLEL_ENCODE + int av1_get_refresh_frame_flags(const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params, FRAME_UPDATE_TYPE frame_update_type, +#if CONFIG_FRAME_PARALLEL_ENCODE + int cur_disp_order, + RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], +#endif // CONFIG_FRAME_PARALLEL_ENCODE const RefBufferStack *const ref_buffer_stack) { const AV1_COMMON *const cm = &cpi->common; const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags = @@ -733,7 +822,7 @@ int av1_get_refresh_frame_flags(const AV1_COMP *const cpi, int refresh_mask = 0; if (ext_refresh_frame_flags->update_pending) { - if (svc->external_ref_frame_config) { + if (svc->set_ref_frame_config) { for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) { int ref_frame_map_idx = svc->ref_idx[i]; refresh_mask |= svc->refresh[ref_frame_map_idx] << ref_frame_map_idx; @@ -777,7 +866,30 @@ int av1_get_refresh_frame_flags(const AV1_COMP *const cpi, } // Search for the open slot to store the current frame. 
- int free_fb_index = get_free_ref_map_index(ref_buffer_stack); + int free_fb_index = get_free_ref_map_index( +#if CONFIG_FRAME_PARALLEL_ENCODE + ref_frame_map_pairs, +#endif // CONFIG_FRAME_PARALLEL_ENCODE + ref_buffer_stack); + +#if CONFIG_FRAME_PARALLEL_ENCODE + // No refresh necessary for these frame types. + if (frame_update_type == OVERLAY_UPDATE || + frame_update_type == INTNL_OVERLAY_UPDATE) + return refresh_mask; + + // If there is an open slot, refresh that one instead of replacing a + // reference. + if (free_fb_index != INVALID_IDX) { + refresh_mask = 1 << free_fb_index; + return refresh_mask; + } + + const int update_arf = frame_update_type == ARF_UPDATE; + const int refresh_idx = + get_refresh_idx(ref_frame_map_pairs, update_arf, cur_disp_order); + return 1 << refresh_idx; +#else switch (frame_update_type) { case KF_UPDATE: case GF_UPDATE: @@ -843,6 +955,7 @@ int av1_get_refresh_frame_flags(const AV1_COMP *const cpi, } return refresh_mask; +#endif // CONFIG_FRAME_PARALLEL_ENCODE } #if !CONFIG_REALTIME_ONLY @@ -852,10 +965,10 @@ void setup_mi(AV1_COMP *const cpi, YV12_BUFFER_CONFIG *src) { MACROBLOCK *const x = &cpi->td.mb; MACROBLOCKD *const xd = &x->e_mbd; - av1_setup_src_planes(x, src, 0, 0, num_planes, cm->seq_params.sb_size); + av1_setup_src_planes(x, src, 0, 0, num_planes, cm->seq_params->sb_size); - av1_setup_block_planes(xd, cm->seq_params.subsampling_x, - cm->seq_params.subsampling_y, num_planes); + av1_setup_block_planes(xd, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y, num_planes); set_mi_offsets(&cm->mi_params, xd, 0, 0); } @@ -872,8 +985,9 @@ static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest, #endif const AV1EncoderConfig *const oxcf = &cpi->oxcf; AV1_COMMON *const cm = &cpi->common; - const GF_GROUP *const gf_group = &cpi->gf_group; - FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + FRAME_UPDATE_TYPE update_type = + 
get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); // Decide whether to apply temporal filtering to the source frame. int apply_filtering = 0; @@ -887,7 +1001,7 @@ static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest, oxcf->algo_cfg.arnr_max_frames > 0 && oxcf->gf_cfg.lag_in_frames > 1; if (allow_kf_filtering) { const double y_noise_level = av1_estimate_noise_from_single_plane( - frame_input->source, 0, cm->seq_params.bit_depth); + frame_input->source, 0, cm->seq_params->bit_depth); apply_filtering = y_noise_level > 0; } else { apply_filtering = 0; @@ -900,6 +1014,9 @@ static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest, // ARF apply_filtering = oxcf->algo_cfg.arnr_max_frames > 0; } + if (is_stat_generation_stage(cpi)) { + apply_filtering = 0; + } #if CONFIG_COLLECT_COMPONENT_TIMING if (cpi->oxcf.pass == 2) start_timing(cpi, apply_filtering_time); @@ -911,7 +1028,7 @@ static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest, int show_existing_alt_ref = 0; // TODO(bohanli): figure out why we need frame_type in cm here. cm->current_frame.frame_type = frame_params->frame_type; - int arf_src_index = gf_group->arf_src_offset[gf_group->index]; + int arf_src_index = gf_group->arf_src_offset[cpi->gf_frame_index]; int is_forward_keyframe = 0; if (!frame_params->show_frame && cpi->no_show_fwd_kf) { // TODO(angiebird): Figure out why this condition yields forward keyframe. 
@@ -922,8 +1039,8 @@ static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest, av1_temporal_filter(cpi, arf_src_index, update_type, is_forward_keyframe, &show_existing_alt_ref); if (code_arf) { - aom_extend_frame_borders(&cpi->alt_ref_buffer, av1_num_planes(cm)); - frame_input->source = &cpi->alt_ref_buffer; + aom_extend_frame_borders(&cpi->ppi->alt_ref_buffer, av1_num_planes(cm)); + frame_input->source = &cpi->ppi->alt_ref_buffer; aom_copy_metadata_to_frame_buffer(frame_input->source, source_buffer->metadata); } @@ -944,12 +1061,12 @@ static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest, // Don't do tpl for fwd key frames or fwd key frame overlays allow_tpl = allow_tpl && !cpi->sf.tpl_sf.disable_filtered_key_tpl && !cpi->no_show_fwd_kf && - gf_group->update_type[gf_group->index] != OVERLAY_UPDATE; + gf_group->update_type[cpi->gf_frame_index] != OVERLAY_UPDATE; } else { // Do tpl after ARF is filtered, or if no ARF, at the second frame of GF // group. // TODO(bohanli): if no ARF, just do it at the first frame. - int gf_index = gf_group->index; + int gf_index = cpi->gf_frame_index; allow_tpl = allow_tpl && (gf_group->update_type[gf_index] == ARF_UPDATE || gf_group->update_type[gf_index] == GF_UPDATE); if (allow_tpl) { @@ -962,10 +1079,13 @@ static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest, if (allow_tpl == 0) { // Avoid the use of unintended TPL stats from previous GOP's results. 
- if (gf_group->index == 0) av1_init_tpl_stats(&cpi->tpl_data); + if (cpi->gf_frame_index == 0 && !is_stat_generation_stage(cpi)) + av1_init_tpl_stats(&cpi->ppi->tpl_data); } else { - if (!cpi->tpl_data.skip_tpl_setup_stats) + if (!cpi->skip_tpl_setup_stats) { + av1_tpl_preload_rc_estimate(cpi, frame_params); av1_tpl_setup_stats(cpi, 0, frame_params, frame_input); + } } if (av1_encode(cpi, dest, frame_input, frame_params, frame_results) != @@ -1003,12 +1123,262 @@ static INLINE int find_unused_ref_frame(const int *used_ref_frames, return INVALID_IDX; } -void av1_get_ref_frames(AV1_COMP *const cpi, RefBufferStack *ref_buffer_stack) { +#if CONFIG_FRAME_PARALLEL_ENCODE +/*!\cond */ +// Struct to keep track of relevant reference frame data. +typedef struct { + int map_idx; + int disp_order; + int pyr_level; + int used; +} RefBufMapData; +/*!\endcond */ + +// Comparison function to sort reference frames in ascending display order. +static int compare_map_idx_pair_asc(const void *a, const void *b) { + if (((RefBufMapData *)a)->disp_order == ((RefBufMapData *)b)->disp_order) { + return 0; + } else if (((const RefBufMapData *)a)->disp_order > + ((const RefBufMapData *)b)->disp_order) { + return 1; + } else { + return -1; + } +} + +// Checks to see if a particular reference frame is already in the reference +// frame map. +static int is_in_ref_map(RefBufMapData *map, int disp_order, int n_frames) { + for (int i = 0; i < n_frames; i++) { + if (disp_order == map[i].disp_order) return 1; + } + return 0; +} + +// Add a reference buffer index to a named reference slot. +static void add_ref_to_slot(RefBufMapData *ref, int *const remapped_ref_idx, + int frame) { + remapped_ref_idx[frame - LAST_FRAME] = ref->map_idx; + ref->used = 1; +} + +// Threshold dictating when we are allowed to start considering +// leaving lowest level frames unmapped. +#define LOW_LEVEL_FRAMES_TR 5 + +// Find which reference buffer should be left out of the named mapping. 
+// This is because there are 8 reference buffers and only 7 named slots. +static void set_unmapped_ref(RefBufMapData *buffer_map, int n_bufs, + int n_min_level_refs, int min_level, + int cur_frame_disp) { + int max_dist = 0; + int unmapped_idx = -1; + if (n_bufs <= ALTREF_FRAME) return; + for (int i = 0; i < n_bufs; i++) { + if (buffer_map[i].used) continue; + if (buffer_map[i].pyr_level != min_level || + n_min_level_refs >= LOW_LEVEL_FRAMES_TR) { + int dist = abs(cur_frame_disp - buffer_map[i].disp_order); + if (dist > max_dist) { + max_dist = dist; + unmapped_idx = i; + } + } + } + assert(unmapped_idx >= 0 && "Unmapped reference not found"); + buffer_map[unmapped_idx].used = 1; +} + +static void get_ref_frames(AV1_COMP *const cpi, + RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], + int cur_frame_disp) { AV1_COMMON *cm = &cpi->common; int *const remapped_ref_idx = cm->remapped_ref_idx; - int *const arf_stack = ref_buffer_stack->arf_stack; - int *const lst_stack = ref_buffer_stack->lst_stack; - int *const gld_stack = ref_buffer_stack->gld_stack; + + int buf_map_idx = 0; + + // Initialize reference frame mappings. + for (int i = 0; i < REF_FRAMES; ++i) remapped_ref_idx[i] = INVALID_IDX; + + RefBufMapData buffer_map[REF_FRAMES]; + int n_bufs = 0; + memset(buffer_map, 0, REF_FRAMES * sizeof(buffer_map[0])); + int min_level = MAX_ARF_LAYERS; + int max_level = 0; + + // Go through current reference buffers and store display order, pyr level, + // and map index. + for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) { + // Get reference frame buffer. + RefFrameMapPair ref_pair = ref_frame_map_pairs[map_idx]; + if (ref_pair.disp_order == -1) continue; + const int frame_order = ref_pair.disp_order; + // Avoid duplicates. + if (is_in_ref_map(buffer_map, frame_order, n_bufs)) continue; + const int reference_frame_level = ref_pair.pyr_level; + + // Keep track of the lowest and highest levels that currently exist. 
+ if (reference_frame_level < min_level) min_level = reference_frame_level; + if (reference_frame_level > max_level) max_level = reference_frame_level; + + buffer_map[n_bufs].map_idx = map_idx; + buffer_map[n_bufs].disp_order = frame_order; + buffer_map[n_bufs].pyr_level = reference_frame_level; + buffer_map[n_bufs].used = 0; + n_bufs++; + } + + // Sort frames in ascending display order. + qsort(buffer_map, n_bufs, sizeof(buffer_map[0]), compare_map_idx_pair_asc); + + int n_min_level_refs = 0; + int n_past_high_level = 0; + int closest_past_ref = -1; + int golden_idx = -1; + int altref_idx = -1; + + // Find the GOLDEN_FRAME and BWDREF_FRAME. + // Also collect various stats about the reference frames for the remaining + // mappings. + for (int i = n_bufs - 1; i >= 0; i--) { + if (buffer_map[i].pyr_level == min_level) { + // Keep track of the number of lowest level frames. + n_min_level_refs++; + if (buffer_map[i].disp_order < cur_frame_disp && golden_idx == -1 && + remapped_ref_idx[GOLDEN_FRAME - LAST_FRAME] == INVALID_IDX) { + // Save index for GOLDEN. + golden_idx = i; + } else if (buffer_map[i].disp_order > cur_frame_disp && + altref_idx == -1 && + remapped_ref_idx[ALTREF_FRAME - LAST_FRAME] == INVALID_IDX) { + // Save index for ALTREF. + altref_idx = i; + } + } else if (buffer_map[i].disp_order == cur_frame_disp) { + // Map the BWDREF_FRAME if this is the show_existing_frame. + add_ref_to_slot(&buffer_map[i], remapped_ref_idx, BWDREF_FRAME); + } + + // Keep track of the number of past frames that are not at the lowest level. + if (buffer_map[i].disp_order < cur_frame_disp && + buffer_map[i].pyr_level != min_level) + n_past_high_level++; + + // Keep track of where the frames change from being past frames to future + // frames. + if (buffer_map[i].disp_order < cur_frame_disp && closest_past_ref < 0) + closest_past_ref = i; + } + + // Do not map GOLDEN and ALTREF based on their pyramid level if all reference + // frames have the same level. 
+ if (n_min_level_refs <= n_bufs) { + // Map the GOLDEN_FRAME. + if (golden_idx > -1) + add_ref_to_slot(&buffer_map[golden_idx], remapped_ref_idx, GOLDEN_FRAME); + // Map the ALTREF_FRAME. + if (altref_idx > -1) + add_ref_to_slot(&buffer_map[altref_idx], remapped_ref_idx, ALTREF_FRAME); + } + + // Find the buffer to be excluded from the mapping. + set_unmapped_ref(buffer_map, n_bufs, n_min_level_refs, min_level, + cur_frame_disp); + + // Place past frames in LAST_FRAME, LAST2_FRAME, and LAST3_FRAME. + for (int frame = LAST_FRAME; frame < GOLDEN_FRAME; frame++) { + // Continue if the current ref slot is already full. + if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue; + // Find the next unmapped reference buffer + // in decreasing ouptut order relative to current picture. + int next_buf_max = 0; + int next_disp_order = INT_MIN; + for (buf_map_idx = n_bufs - 1; buf_map_idx >= 0; buf_map_idx--) { + if (!buffer_map[buf_map_idx].used && + buffer_map[buf_map_idx].disp_order < cur_frame_disp && + buffer_map[buf_map_idx].disp_order > next_disp_order) { + next_disp_order = buffer_map[buf_map_idx].disp_order; + next_buf_max = buf_map_idx; + } + } + buf_map_idx = next_buf_max; + if (buf_map_idx < 0) break; + if (buffer_map[buf_map_idx].used) break; + add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame); + } + + // Place future frames (if there are any) in BWDREF_FRAME and ALTREF2_FRAME. + for (int frame = BWDREF_FRAME; frame < REF_FRAMES; frame++) { + // Continue if the current ref slot is already full. + if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue; + // Find the next unmapped reference buffer + // in increasing ouptut order relative to current picture. 
+ int next_buf_max = 0; + int next_disp_order = INT_MAX; + for (buf_map_idx = n_bufs - 1; buf_map_idx >= 0; buf_map_idx--) { + if (!buffer_map[buf_map_idx].used && + buffer_map[buf_map_idx].disp_order > cur_frame_disp && + buffer_map[buf_map_idx].disp_order < next_disp_order) { + next_disp_order = buffer_map[buf_map_idx].disp_order; + next_buf_max = buf_map_idx; + } + } + buf_map_idx = next_buf_max; + if (buf_map_idx < 0) break; + if (buffer_map[buf_map_idx].used) break; + add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame); + } + + // Place remaining past frames. + buf_map_idx = closest_past_ref; + for (int frame = LAST_FRAME; frame < REF_FRAMES; frame++) { + // Continue if the current ref slot is already full. + if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue; + // Find the next unmapped reference buffer. + for (; buf_map_idx >= 0; buf_map_idx--) { + if (!buffer_map[buf_map_idx].used) break; + } + if (buf_map_idx < 0) break; + if (buffer_map[buf_map_idx].used) break; + add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame); + } + + // Place remaining future frames. + buf_map_idx = n_bufs - 1; + for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; frame--) { + // Continue if the current ref slot is already full. + if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue; + // Find the next unmapped reference buffer. + for (; buf_map_idx > closest_past_ref; buf_map_idx--) { + if (!buffer_map[buf_map_idx].used) break; + } + if (buf_map_idx < 0) break; + if (buffer_map[buf_map_idx].used) break; + add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame); + } + + // Fill any slots that are empty (should only happen for the first 7 frames). 
+ for (int i = 0; i < REF_FRAMES; ++i) + if (remapped_ref_idx[i] == INVALID_IDX) remapped_ref_idx[i] = 0; +} +#endif // CONFIG_FRAME_PARALLEL_ENCODE + +void av1_get_ref_frames(const RefBufferStack *ref_buffer_stack, +#if CONFIG_FRAME_PARALLEL_ENCODE + AV1_COMP *cpi, + RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], + int cur_frame_disp, +#endif // CONFIG_FRAME_PARALLEL_ENCODE + int remapped_ref_idx[REF_FRAMES]) { +#if CONFIG_FRAME_PARALLEL_ENCODE + (void)ref_buffer_stack; + (void)remapped_ref_idx; + get_ref_frames(cpi, ref_frame_map_pairs, cur_frame_disp); + return; +#else + const int *const arf_stack = ref_buffer_stack->arf_stack; + const int *const lst_stack = ref_buffer_stack->lst_stack; + const int *const gld_stack = ref_buffer_stack->gld_stack; const int arf_stack_size = ref_buffer_stack->arf_stack_size; const int lst_stack_size = ref_buffer_stack->lst_stack_size; const int gld_stack_size = ref_buffer_stack->gld_stack_size; @@ -1079,6 +1449,7 @@ void av1_get_ref_frames(AV1_COMP *const cpi, RefBufferStack *ref_buffer_stack) { remapped_ref_idx[idx] = ref_buffer_stack->gld_stack[0]; } } +#endif // CONFIG_FRAME_PARALLEL_ENCODE } int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, @@ -1088,7 +1459,7 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, int flush) { AV1EncoderConfig *const oxcf = &cpi->oxcf; AV1_COMMON *const cm = &cpi->common; - GF_GROUP *gf_group = &cpi->gf_group; + GF_GROUP *gf_group = &cpi->ppi->gf_group; ExternalFlags *const ext_flags = &cpi->ext_flags; GFConfig *const gf_cfg = &oxcf->gf_cfg; @@ -1112,9 +1483,9 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, if (!av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage)) { #if !CONFIG_REALTIME_ONLY - if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) { + if (flush && oxcf->pass == 1 && !cpi->ppi->twopass.first_pass_done) { av1_end_first_pass(cpi); /* get last stats packet */ - cpi->twopass.first_pass_done = 1; + 
cpi->ppi->twopass.first_pass_done = 1; } #endif return -1; @@ -1128,11 +1499,9 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, AOMMIN(gf_cfg->gf_min_pyr_height, gf_cfg->gf_max_pyr_height); } - cpi->tpl_data.skip_tpl_setup_stats = 0; + cpi->skip_tpl_setup_stats = 0; #if !CONFIG_REALTIME_ONLY - const int use_one_pass_rt_params = has_no_stats_stage(cpi) && - oxcf->mode == REALTIME && - gf_cfg->lag_in_frames == 0; + const int use_one_pass_rt_params = is_one_pass_rt_params(cpi); if (!use_one_pass_rt_params && !is_stat_generation_stage(cpi)) { #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, av1_get_second_pass_params_time); @@ -1148,19 +1517,19 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, // If this is a forward keyframe, mark as a show_existing_frame // TODO(bohanli): find a consistent condition for fwd keyframes if (oxcf->kf_cfg.fwd_kf_enabled && - gf_group->update_type[gf_group->index] == OVERLAY_UPDATE && + gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE && cpi->rc.frames_to_key == 0) { frame_params.show_existing_frame = 1; } else { frame_params.show_existing_frame = (cpi->show_existing_alt_ref && - gf_group->update_type[gf_group->index] == OVERLAY_UPDATE) || - gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE; + gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) || + gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE; } frame_params.show_existing_frame &= allow_show_existing(cpi, *frame_flags); // Reset show_existing_alt_ref decision to 0 after it is used. - if (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE) { + if (gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) { cpi->show_existing_alt_ref = 0; } } else { @@ -1181,13 +1550,20 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, if (source == NULL) { // If no source was found, we can't encode a frame. 
#if !CONFIG_REALTIME_ONLY - if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) { + if (flush && oxcf->pass == 1 && !cpi->ppi->twopass.first_pass_done) { av1_end_first_pass(cpi); /* get last stats packet */ - cpi->twopass.first_pass_done = 1; + cpi->ppi->twopass.first_pass_done = 1; } #endif return -1; } + +#if CONFIG_FRAME_PARALLEL_ENCODE + // reset src_offset to allow actual encode call for this frame to get its + // source. + gf_group->src_offset[cpi->gf_frame_index] = 0; +#endif + // Source may be changed if temporal filtered later. frame_input.source = &source->img; frame_input.last_source = last_source != NULL ? &last_source->img : NULL; @@ -1216,7 +1592,7 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, &cm->film_grain_params); } else { cm->cur_frame->film_grain_params_present = - cm->seq_params.film_grain_params_present; + cm->seq_params->film_grain_params_present; } // only one operating point supported now const int64_t pts64 = ticks_to_timebase_units(timestamp_ratio, *time_stamp); @@ -1226,19 +1602,20 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, #if CONFIG_REALTIME_ONLY av1_get_one_pass_rt_params(cpi, &frame_params, *frame_flags); - if (cpi->oxcf.speed >= 5 && cm->number_spatial_layers == 1 && - cm->number_temporal_layers == 1) - av1_set_reference_structure_one_pass_rt(cpi, gf_group->index == 0); + if (cpi->oxcf.speed >= 5 && cpi->ppi->number_spatial_layers == 1 && + cpi->ppi->number_temporal_layers == 1) + av1_set_reference_structure_one_pass_rt(cpi, cpi->gf_frame_index == 0); #else if (use_one_pass_rt_params) { av1_get_one_pass_rt_params(cpi, &frame_params, *frame_flags); - if (cpi->oxcf.speed >= 5 && cm->number_spatial_layers == 1 && - cm->number_temporal_layers == 1) - av1_set_reference_structure_one_pass_rt(cpi, gf_group->index == 0); + if (cpi->oxcf.speed >= 5 && cpi->ppi->number_spatial_layers == 1 && + cpi->ppi->number_temporal_layers == 1) + av1_set_reference_structure_one_pass_rt(cpi, 
cpi->gf_frame_index == 0); } #endif - FRAME_UPDATE_TYPE frame_update_type = get_frame_update_type(gf_group); + FRAME_UPDATE_TYPE frame_update_type = + get_frame_update_type(gf_group, cpi->gf_frame_index); if (frame_params.show_existing_frame && frame_params.frame_type != KEY_FRAME) { @@ -1302,9 +1679,21 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, const RefCntBuffer *ref_frames[INTER_REFS_PER_FRAME]; const YV12_BUFFER_CONFIG *ref_frame_buf[INTER_REFS_PER_FRAME]; +#if CONFIG_FRAME_PARALLEL_ENCODE + RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]; + init_ref_map_pair(cpi, ref_frame_map_pairs); + const int order_offset = gf_group->arf_src_offset[cpi->gf_frame_index]; + const int cur_frame_disp = + cpi->common.current_frame.frame_number + order_offset; +#endif // CONFIG_FRAME_PARALLEL_ENCODE + if (!ext_flags->refresh_frame.update_pending) { - av1_get_ref_frames(cpi, &cpi->ref_buffer_stack); - } else if (cpi->svc.external_ref_frame_config) { + av1_get_ref_frames(&cpi->ref_buffer_stack, +#if CONFIG_FRAME_PARALLEL_ENCODE + cpi, ref_frame_map_pairs, cur_frame_disp, +#endif // CONFIG_FRAME_PARALLEL_ENCODE + cm->remapped_ref_idx); + } else if (cpi->svc.set_ref_frame_config) { for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) cm->remapped_ref_idx[i] = cpi->svc.ref_idx[i]; } @@ -1319,19 +1708,54 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, frame_params.ref_frame_flags = get_ref_frame_flags( &cpi->sf, ref_frame_buf, ext_flags->ref_frame_flags); +#if CONFIG_FRAME_PARALLEL_ENCODE + // Set primary_ref_frame of non-reference frames as PRIMARY_REF_NONE. 
+ if (cpi->ppi->gf_group.is_frame_non_ref[cpi->gf_frame_index]) { + frame_params.primary_ref_frame = PRIMARY_REF_NONE; + } else { + frame_params.primary_ref_frame = + choose_primary_ref_frame(cpi, &frame_params); + } +#else frame_params.primary_ref_frame = choose_primary_ref_frame(cpi, &frame_params); - frame_params.order_offset = gf_group->arf_src_offset[gf_group->index]; - - frame_params.refresh_frame_flags = av1_get_refresh_frame_flags( - cpi, &frame_params, frame_update_type, &cpi->ref_buffer_stack); - +#endif // CONFIG_FRAME_PARALLEL_ENCODE + + frame_params.order_offset = gf_group->arf_src_offset[cpi->gf_frame_index]; + + frame_params.refresh_frame_flags = + av1_get_refresh_frame_flags(cpi, &frame_params, frame_update_type, +#if CONFIG_FRAME_PARALLEL_ENCODE + cur_frame_disp, ref_frame_map_pairs, +#endif // CONFIG_FRAME_PARALLEL_ENCODE + &cpi->ref_buffer_stack); + +#if CONFIG_FRAME_PARALLEL_ENCODE + // Make the frames marked as is_frame_non_ref to non-reference frames. + if (gf_group->is_frame_non_ref[cpi->gf_frame_index]) + frame_params.refresh_frame_flags = 0; +#endif // CONFIG_FRAME_PARALLEL_ENCODE + +#if CONFIG_FRAME_PARALLEL_ENCODE + frame_params.existing_fb_idx_to_show = INVALID_IDX; + // Find the frame buffer to show based on display order. + if (frame_params.show_existing_frame) { + for (int frame = 0; frame < REF_FRAMES; frame++) { + const RefCntBuffer *const buf = cm->ref_frame_map[frame]; + if (buf == NULL) continue; + const int frame_order = (int)buf->display_order_hint; + if (frame_order == cur_frame_disp) + frame_params.existing_fb_idx_to_show = frame; + } + } +#else frame_params.existing_fb_idx_to_show = frame_params.show_existing_frame ? (frame_update_type == INTNL_OVERLAY_UPDATE ? get_ref_frame_map_idx(cm, BWDREF_FRAME) : get_ref_frame_map_idx(cm, ALTREF_FRAME)) : INVALID_IDX; +#endif // CONFIG_FRAME_PARALLEL_ENCODE } // The way frame_params->remapped_ref_idx is setup is a placeholder. 
@@ -1351,6 +1775,12 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, cm->quant_params.using_qmatrix = oxcf->q_cfg.using_qm; } +#if CONFIG_FRAME_PARALLEL_ENCODE + // Copy previous frame's largest MV component from ppi to cpi. + if (!is_stat_generation_stage(cpi) && cpi->do_frame_data_update) + cpi->mv_search_params.max_mv_magnitude = cpi->ppi->max_mv_magnitude; +#endif // CONFIG_FRAME_PARALLEL_ENCODE + #if CONFIG_REALTIME_ONLY if (av1_encode(cpi, dest, &frame_input, &frame_params, &frame_results) != AOM_CODEC_OK) { @@ -1369,10 +1799,17 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, } #endif // CONFIG_REALTIME_ONLY +#if CONFIG_FRAME_PARALLEL_ENCODE + // Store current frame's largest MV component in ppi. + if (!is_stat_generation_stage(cpi) && cpi->do_frame_data_update) + cpi->ppi->max_mv_magnitude = cpi->mv_search_params.max_mv_magnitude; +#endif + if (!is_stat_generation_stage(cpi)) { // First pass doesn't modify reference buffer assignment or produce frame // flags update_frame_flags(&cpi->common, &cpi->refresh_frame, frame_flags); +#if !CONFIG_FRAME_PARALLEL_ENCODE if (!ext_flags->refresh_frame.update_pending) { int ref_map_index = av1_get_refresh_ref_frame_map(cm->current_frame.refresh_frame_flags); @@ -1380,6 +1817,7 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, cm->show_existing_frame, ref_map_index, &cpi->ref_buffer_stack); } +#endif // CONFIG_FRAME_PARALLEL_ENCODE } #if !CONFIG_REALTIME_ONLY @@ -1408,7 +1846,7 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, } if (!is_stat_generation_stage(cpi)) { - update_fb_of_context_type(cpi, &frame_params, cpi->fb_of_context_type); + update_fb_of_context_type(cpi, &frame_params, cpi->ppi->fb_of_context_type); set_additional_frame_flags(cm, frame_flags); update_rc_counts(cpi); } @@ -1421,7 +1859,7 @@ int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, cpi->droppable = is_frame_droppable(&cpi->svc, &ext_flags->refresh_frame); } - 
if (cpi->use_svc) av1_save_layer_context(cpi); + if (cpi->ppi->use_svc) av1_save_layer_context(cpi); return AOM_CODEC_OK; } diff --git a/third_party/libaom/source/libaom/av1/encoder/encode_strategy.h b/third_party/libaom/source/libaom/av1/encoder/encode_strategy.h index 351e8a1328..c7b75c8430 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encode_strategy.h +++ b/third_party/libaom/source/libaom/av1/encoder/encode_strategy.h @@ -69,6 +69,10 @@ void av1_configure_buffer_updates( int av1_get_refresh_frame_flags(const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params, FRAME_UPDATE_TYPE frame_update_type, +#if CONFIG_FRAME_PARALLEL_ENCODE + int cur_disp_order, + RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], +#endif // CONFIG_FRAME_PARALLEL_ENCODE const RefBufferStack *const ref_buffer_stack); int av1_get_refresh_ref_frame_map(int refresh_frame_flags); @@ -79,7 +83,25 @@ void av1_update_ref_frame_map(AV1_COMP *cpi, int ref_map_index, RefBufferStack *ref_buffer_stack); -void av1_get_ref_frames(AV1_COMP *const cpi, RefBufferStack *ref_buffer_stack); +/*!\brief Obtain indices of reference frames from reference frame buffer stacks + * + * \callgraph + * \callergraph + * + * \param[in] ref_buffer_stack Data structure for reference frame buffer + * stacks. + * \param[out] remapped_ref_idx An array for storing indices of reference + * frames. The index is used to retrieve a + * reference frame buffer from ref_frame_map + * in AV1Common. 
+ */ +void av1_get_ref_frames(const RefBufferStack *ref_buffer_stack, +#if CONFIG_FRAME_PARALLEL_ENCODE + AV1_COMP *cpi, + RefFrameMapPair ref_frame_map_pairs[REF_FRAMES], + int cur_frame_disp, +#endif // CONFIG_FRAME_PARALLEL_ENCODE + int remapped_ref_idx[REF_FRAMES]); int is_forced_keyframe_pending(struct lookahead_ctx *lookahead, const int up_to_index, diff --git a/third_party/libaom/source/libaom/av1/encoder/encodeframe.c b/third_party/libaom/source/libaom/av1/encoder/encodeframe.c index 24d3488245..b3f836b481 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encodeframe.c +++ b/third_party/libaom/source/libaom/av1/encoder/encodeframe.c @@ -55,6 +55,7 @@ #include "av1/encoder/encodetxb.h" #include "av1/encoder/ethread.h" #include "av1/encoder/extend.h" +#include "av1/encoder/intra_mode_search_utils.h" #include "av1/encoder/ml.h" #include "av1/encoder/motion_search_facade.h" #include "av1/encoder/partition_strategy.h" @@ -150,7 +151,7 @@ unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi, BLOCK_SIZE bs) { unsigned int sse; const unsigned int var = - cpi->fn_ptr[bs].vf(ref->buf, ref->stride, AV1_VAR_OFFS, 0, &sse); + cpi->ppi->fn_ptr[bs].vf(ref->buf, ref->stride, AV1_VAR_OFFS, 0, &sse); return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); } @@ -163,9 +164,9 @@ unsigned int av1_high_get_sby_perpixel_variance(const AV1_COMP *cpi, const uint16_t *high_var_offs[3] = { AV1_HIGH_VAR_OFFS_8, AV1_HIGH_VAR_OFFS_10, AV1_HIGH_VAR_OFFS_12 }; - var = - cpi->fn_ptr[bs].vf(ref->buf, ref->stride, - CONVERT_TO_BYTEPTR(high_var_offs[off_index]), 0, &sse); + var = cpi->ppi->fn_ptr[bs].vf(ref->buf, ref->stride, + CONVERT_TO_BYTEPTR(high_var_offs[off_index]), 0, + &sse); return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); } @@ -181,7 +182,8 @@ static unsigned int get_sby_perpixel_diff_variance(const AV1_COMP *const cpi, assert(last != NULL); last_y = &last->y_buffer[mi_row * MI_SIZE * last->y_stride + mi_col * MI_SIZE]; - var = 
cpi->fn_ptr[bs].vf(ref->buf, ref->stride, last_y, last->y_stride, &sse); + var = cpi->ppi->fn_ptr[bs].vf(ref->buf, ref->stride, last_y, last->y_stride, + &sse); return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); } @@ -242,7 +244,7 @@ static AOM_INLINE void setup_delta_q(AV1_COMP *const cpi, ThreadData *td, const DeltaQInfo *const delta_q_info = &cm->delta_q_info; assert(delta_q_info->delta_q_present_flag); - const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; // Delta-q modulation based on variance av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size); @@ -307,7 +309,7 @@ static AOM_INLINE void setup_delta_q(AV1_COMP *const cpi, ThreadData *td, (int8_t)clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); const int frame_lf_count = av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; - const int mib_size = cm->seq_params.mib_size; + const int mib_size = cm->seq_params->mib_size; // pre-set the delta lf for loop filter. 
Note that this value is set // before mi is assigned for each block in current superblock @@ -326,22 +328,23 @@ static AOM_INLINE void setup_delta_q(AV1_COMP *const cpi, ThreadData *td, static void init_ref_frame_space(AV1_COMP *cpi, ThreadData *td, int mi_row, int mi_col) { const AV1_COMMON *cm = &cpi->common; - const GF_GROUP *const gf_group = &cpi->gf_group; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const CommonModeInfoParams *const mi_params = &cm->mi_params; MACROBLOCK *x = &td->mb; - const int frame_idx = cpi->gf_group.index; - TplParams *const tpl_data = &cpi->tpl_data; - TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx]; + const int frame_idx = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; av1_zero(x->tpl_keep_ref_frame); - if (tpl_frame->is_valid == 0) return; - if (!is_frame_tpl_eligible(gf_group, gf_group->index)) return; if (frame_idx >= MAX_TPL_FRAME_IDX) return; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx]; + if (!tpl_frame->is_valid) return; + if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return; if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return; - const int is_overlay = cpi->gf_group.update_type[frame_idx] == OVERLAY_UPDATE; + const int is_overlay = + cpi->ppi->gf_group.update_type[frame_idx] == OVERLAY_UPDATE; if (is_overlay) { memset(x->tpl_keep_ref_frame, 1, sizeof(x->tpl_keep_ref_frame)); return; @@ -351,7 +354,7 @@ static void init_ref_frame_space(AV1_COMP *cpi, ThreadData *td, int mi_row, const int tpl_stride = tpl_frame->stride; int64_t inter_cost[INTER_REFS_PER_FRAME] = { 0 }; const int step = 1 << block_mis_log2; - const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; const int mi_row_end = AOMMIN(mi_size_high[sb_size] + mi_row, mi_params->mi_rows); @@ -426,15 +429,15 @@ static void init_ref_frame_space(AV1_COMP *cpi, ThreadData *td, int mi_row, static 
AOM_INLINE void adjust_rdmult_tpl_model(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col) { - const BLOCK_SIZE sb_size = cpi->common.seq_params.sb_size; + const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size; const int orig_rdmult = cpi->rd.RDMULT; - assert(IMPLIES(cpi->gf_group.size > 0, - cpi->gf_group.index < cpi->gf_group.size)); - const int gf_group_index = cpi->gf_group.index; + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int gf_group_index = cpi->gf_frame_index; if (cpi->oxcf.algo_cfg.enable_tpl_model && cpi->oxcf.q_cfg.aq_mode == NO_AQ && cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q && gf_group_index > 0 && - cpi->gf_group.update_type[gf_group_index] == ARF_UPDATE) { + cpi->ppi->gf_group.update_type[gf_group_index] == ARF_UPDATE) { const int dr = av1_get_rdmult_delta(cpi, sb_size, mi_row, mi_col, orig_rdmult); x->rdmult = dr; @@ -451,7 +454,7 @@ static void get_estimated_pred(AV1_COMP *cpi, const TileInfo *const tile, MACROBLOCKD *xd = &x->e_mbd; // TODO(kyslov) Extend to 128x128 - assert(cm->seq_params.sb_size == BLOCK_64X64); + assert(cm->seq_params->sb_size == BLOCK_64X64); av1_set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); @@ -512,7 +515,7 @@ static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td, const TileInfo *const tile_info = &tile_data->tile_info; MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + get_mi_grid_idx(&cm->mi_params, mi_row, mi_col); - const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; // Grade the temporal variation of the sb, the grade will be used to decide // fast mode search strategy for coding blocks @@ -557,6 +560,20 @@ static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td, sf->part_sf.partition_search_type == VAR_BASED_PARTITION); set_cb_offsets(td->mb.cb_offset, 0, 0); + // Initialize the flag to skip cdef for 64x64 blocks: if color sensitivy is + // on, set to 0 (don't 
skip). + if (sf->rt_sf.skip_cdef_sb) { + const int block64_in_sb = (sb_size == BLOCK_128X128) ? 2 : 1; + for (int r = 0; r < block64_in_sb; ++r) { + for (int c = 0; c < block64_in_sb; ++c) { + const int idx_in_sb = + r * MI_SIZE_64X64 * cm->mi_params.mi_stride + c * MI_SIZE_64X64; + if (mi[idx_in_sb]) + mi[idx_in_sb]->skip_cdef_curr_sb = + !(x->color_sensitivity_sb[0] || x->color_sensitivity_sb[1]); + } + } + } // Adjust and encode the superblock PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size); av1_nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size, @@ -599,7 +616,7 @@ static INLINE void init_encode_rd_sb(AV1_COMP *cpi, ThreadData *td, if (gather_tpl_data) { if (cm->delta_q_info.delta_q_present_flag) { const int num_planes = av1_num_planes(cm); - const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; setup_delta_q(cpi, td, x, tile_info, mi_row, mi_col, num_planes); av1_tpl_rdmult_setup_sb(cpi, x, sb_size, mi_row, mi_col); } @@ -637,7 +654,7 @@ static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td, const TileInfo *const tile_info = &tile_data->tile_info; MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + get_mi_grid_idx(&cm->mi_params, mi_row, mi_col); - const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; const int num_planes = av1_num_planes(cm); int dummy_rate; int64_t dummy_dist; @@ -708,10 +725,17 @@ static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td, cpi->oxcf.unit_test_cfg.sb_multipass_unit_test ? 
2 : 1; if (num_passes == 1) { +#if CONFIG_PARTITION_SEARCH_ORDER + av1_reset_part_sf(&cpi->sf.part_sf); + RD_STATS this_rdc; + av1_rd_partition_search(cpi, td, tile_data, tp, sms_root, mi_row, mi_col, + sb_size, &this_rdc); +#else PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size); av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, &dummy_rdc, dummy_rdc, pc_root, sms_root, NULL, SB_SINGLE_PASS, NULL); +#endif // CONFIG_PARTITION_SEARCH_ORDER } else { // First pass SB_FIRST_PASS_STATS sb_fp_stats; @@ -753,7 +777,8 @@ static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td, static AOM_INLINE int is_rtc_mode(const CostUpdateFreq *cost_upd_freq, int use_non_rd_mode) { return (use_non_rd_mode && cost_upd_freq->coeff >= 2 && - cost_upd_freq->mode >= 2 && cost_upd_freq->mv >= 2); + cost_upd_freq->mode >= 2 && cost_upd_freq->mv >= 2 && + cost_upd_freq->dv >= 2); } /*!\brief Encode a superblock row by breaking it into superblocks @@ -776,9 +801,9 @@ static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td, MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_data->tile_info); - const BLOCK_SIZE sb_size = cm->seq_params.sb_size; - const int mib_size = cm->seq_params.mib_size; - const int mib_size_log2 = cm->seq_params.mib_size_log2; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + const int mib_size = cm->seq_params->mib_size; + const int mib_size_log2 = cm->seq_params->mib_size_log2; const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2; const int use_nonrd_mode = cpi->sf.rt_sf.use_nonrd_pick_mode; const CostUpdateFreq *const cost_upd_freq = &cpi->oxcf.cost_upd_freq; @@ -833,6 +858,8 @@ static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td, av1_set_cost_upd_freq(cpi, td, tile_info, mi_row, mi_col); // Reset color coding related parameters + x->color_sensitivity_sb[0] = 0; + x->color_sensitivity_sb[1] = 0; 
x->color_sensitivity[0] = 0; x->color_sensitivity[1] = 0; x->content_state_sb.source_sad = kMedSad; @@ -855,6 +882,12 @@ static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td, seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); } + // Produce the gradient data at superblock level, when intra mode pruning + // based on hog is enabled. + if (cpi->sf.intra_sf.intra_pruning_with_hog || + cpi->sf.intra_sf.chroma_intra_pruning_with_hog) + produce_gradients_for_sb(cpi, x, sb_size, mi_row, mi_col); + // encode the superblock if (use_nonrd_mode) { encode_nonrd_sb(cpi, td, tile_data, tp, mi_row, mi_col, seg_skip); @@ -886,10 +919,10 @@ static AOM_INLINE void init_encode_frame_mb_context(AV1_COMP *cpi) { // Copy data over into macro block data structures. av1_setup_src_planes(x, cpi->source, 0, 0, num_planes, - cm->seq_params.sb_size); + cm->seq_params->sb_size); - av1_setup_block_planes(xd, cm->seq_params.subsampling_x, - cm->seq_params.subsampling_y, num_planes); + av1_setup_block_planes(xd, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y, num_planes); } void av1_alloc_tile_data(AV1_COMP *cpi) { @@ -927,13 +960,14 @@ void av1_init_tile_data(AV1_COMP *cpi) { TileInfo *const tile_info = &tile_data->tile_info; av1_tile_init(tile_info, cm, tile_row, tile_col); tile_data->firstpass_top_mv = kZeroMv; + tile_data->abs_sum_level = 0; if (pre_tok != NULL && tplist != NULL) { token_info->tile_tok[tile_row][tile_col] = pre_tok + tile_tok; pre_tok = token_info->tile_tok[tile_row][tile_col]; - tile_tok = allocated_tokens(*tile_info, - cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, - num_planes); + tile_tok = allocated_tokens( + *tile_info, cm->seq_params->mib_size_log2 + MI_SIZE_LOG2, + num_planes); token_info->tplist[tile_row][tile_col] = tplist + tplist_count; tplist = token_info->tplist[tile_row][tile_col]; tplist_count = av1_get_sb_rows_in_tile(cm, tile_data->tile_info); @@ -961,14 +995,14 @@ void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, 
int tile_row, TokenExtra *tok = NULL; TokenList *const tplist = cpi->token_info.tplist[tile_row][tile_col]; const int sb_row_in_tile = - (mi_row - tile_info->mi_row_start) >> cm->seq_params.mib_size_log2; + (mi_row - tile_info->mi_row_start) >> cm->seq_params->mib_size_log2; const int tile_mb_cols = (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2; const int num_mb_rows_in_sb = - ((1 << (cm->seq_params.mib_size_log2 + MI_SIZE_LOG2)) + 8) >> 4; + ((1 << (cm->seq_params->mib_size_log2 + MI_SIZE_LOG2)) + 8) >> 4; get_start_tok(cpi, tile_row, tile_col, mi_row, &tok, - cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes); + cm->seq_params->mib_size_log2 + MI_SIZE_LOG2, num_planes); assert(tplist != NULL); tplist[sb_row_in_tile].start = tok; @@ -979,7 +1013,7 @@ void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, int tile_row, assert((unsigned int)(tok - tplist[sb_row_in_tile].start) <= get_token_alloc(num_mb_rows_in_sb, tile_mb_cols, - cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, + cm->seq_params->mib_size_log2 + MI_SIZE_LOG2, num_planes)); (void)tile_mb_cols; @@ -1005,7 +1039,7 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row, &td->mb.e_mbd); if (cpi->oxcf.intra_mode_cfg.enable_cfl_intra) - cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params); + cfl_init(&td->mb.e_mbd.cfl, cm->seq_params); if (td->mb.txfm_search_info.txb_rd_records != NULL) { av1_crc32c_calculator_init( @@ -1013,9 +1047,10 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row, } for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end; - mi_row += cm->seq_params.mib_size) { + mi_row += cm->seq_params->mib_size) { av1_encode_sb_row(cpi, td, tile_row, tile_col, mi_row); } + this_tile->abs_sum_level = td->abs_sum_level; } /*!\brief Break one frame into tiles and encode the tiles @@ -1030,15 +1065,13 @@ static AOM_INLINE void encode_tiles(AV1_COMP *cpi) { const int tile_rows = cm->tiles.rows; int tile_col, tile_row; + MACROBLOCK *const mb = 
&cpi->td.mb; assert(IMPLIES(cpi->tile_data == NULL, cpi->allocated_tiles < tile_cols * tile_rows)); if (cpi->allocated_tiles < tile_cols * tile_rows) av1_alloc_tile_data(cpi); av1_init_tile_data(cpi); - if (!cpi->sf.rt_sf.use_nonrd_pick_mode) { - cpi->td.mb.txfm_search_info.txb_rd_records = - (TxbRdRecords *)aom_malloc(sizeof(TxbRdRecords)); - } + av1_alloc_mb_data(cm, mb, cpi->sf.rt_sf.use_nonrd_pick_mode); for (tile_row = 0; tile_row < tile_rows; ++tile_row) { for (tile_col = 0; tile_col < tile_cols; ++tile_col) { @@ -1046,6 +1079,7 @@ static AOM_INLINE void encode_tiles(AV1_COMP *cpi) { &cpi->tile_data[tile_row * cm->tiles.cols + tile_col]; cpi->td.intrabc_used = 0; cpi->td.deltaq_used = 0; + cpi->td.abs_sum_level = 0; cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; cpi->td.mb.tile_pb_ctx = &this_tile->tctx; // Reset cyclic refresh counters. @@ -1062,10 +1096,7 @@ static AOM_INLINE void encode_tiles(AV1_COMP *cpi) { } } - if (cpi->td.mb.txfm_search_info.txb_rd_records) { - aom_free(cpi->td.mb.txfm_search_info.txb_rd_records); - cpi->td.mb.txfm_search_info.txb_rd_records = NULL; - } + av1_dealloc_mb_data(cm, mb); } // Set the relative distance of a reference frame w.r.t. current frame @@ -1141,10 +1172,10 @@ static int check_skip_mode_enabled(AV1_COMP *const cpi) { const int cur_offset = (int)cm->current_frame.order_hint; int ref_offset[2]; get_skip_mode_ref_offsets(cm, ref_offset); - const int cur_to_ref0 = get_relative_dist(&cm->seq_params.order_hint_info, + const int cur_to_ref0 = get_relative_dist(&cm->seq_params->order_hint_info, cur_offset, ref_offset[0]); - const int cur_to_ref1 = abs(get_relative_dist(&cm->seq_params.order_hint_info, - cur_offset, ref_offset[1])); + const int cur_to_ref1 = abs(get_relative_dist( + &cm->seq_params->order_hint_info, cur_offset, ref_offset[1])); if (abs(cur_to_ref0 - cur_to_ref1) > 1) return 0; // High Latency: Turn off skip mode if all refs are fwd. 
@@ -1248,6 +1279,9 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) { MACROBLOCKD *const xd = &x->e_mbd; RD_COUNTS *const rdc = &cpi->td.rd_counts; FrameProbInfo *const frame_probs = &cpi->frame_probs; +#if CONFIG_FRAME_PARALLEL_ENCODE + FrameProbInfo *const temp_frame_probs = &cpi->ppi->temp_frame_probs; +#endif // CONFIG_FRAME_PARALLEL_ENCODE IntraBCHashInfo *const intrabc_hash_info = &x->intrabc_hash_info; MultiThreadInfo *const mt_info = &cpi->mt_info; AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt; @@ -1278,9 +1312,15 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) { if (features->allow_warped_motion && cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { - const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); - if (frame_probs->warped_probs[update_type] < - cpi->sf.inter_sf.prune_warped_prob_thresh) + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + int warped_probability; +#if CONFIG_FRAME_PARALLEL_ENCODE + warped_probability = temp_frame_probs->warped_probs[update_type]; +#else + warped_probability = frame_probs->warped_probs[update_type]; +#endif + if (warped_probability < cpi->sf.inter_sf.prune_warped_prob_thresh) features->allow_warped_motion = 0; } @@ -1316,7 +1356,7 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) { // Hash data generated for screen contents is used for intraBC ME const int min_alloc_size = block_size_wide[mi_params->mi_alloc_bsize]; const int max_sb_size = - (1 << (cm->seq_params.mib_size_log2 + MI_SIZE_LOG2)); + (1 << (cm->seq_params->mib_size_log2 + MI_SIZE_LOG2)); int src_idx = 0; for (int size = 4; size <= max_sb_size; size *= 2, src_idx = !src_idx) { const int dst_idx = !src_idx; @@ -1377,10 +1417,10 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) { // is used for ineligible frames. That effectively will turn off row_mt // usage. 
Note objective delta_q and tpl eligible frames are only altref // frames currently. - const GF_GROUP *gf_group = &cpi->gf_group; + const GF_GROUP *gf_group = &cpi->ppi->gf_group; if (cm->delta_q_info.delta_q_present_flag) { if (deltaq_mode == DELTA_Q_OBJECTIVE && - !is_frame_tpl_eligible(gf_group, gf_group->index)) + !is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) cm->delta_q_info.delta_q_present_flag = 0; } @@ -1500,8 +1540,8 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) { features->tx_mode = select_tx_mode(cm, tx_search_type); if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { - const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); - + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); for (i = 0; i < TX_SIZES_ALL; i++) { int sum = 0; int j; @@ -1519,13 +1559,33 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) { left -= prob; if (j == 0) prob += left; frame_probs->tx_type_probs[update_type][i][j] = prob; +#if CONFIG_FRAME_PARALLEL_ENCODE + /* TODO(FPMT): The current update is happening in cpi->frame_probs, + * this need to be taken care appropriately in final FPMT implementation + * to carry these values to subsequent frames. The frame_probs update is + * accumulated across frames, so the values from all individual parallel + * frames need to be taken into account after all the parallel frames + * are encoded. + * + * Only for quality simulation purpose - Update the accumulated frame + * probabilities in ppi->temp_variable based on the update flag. 
+ */ + if (cpi->do_frame_data_update) { + for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES; + update_type_idx++) { + temp_frame_probs->tx_type_probs[update_type_idx][i][j] = + frame_probs->tx_type_probs[update_type_idx][i][j]; + } + } +#endif // CONFIG_FRAME_PARALLEL_ENCODE } } } if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 && cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) { - const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); for (i = 0; i < BLOCK_SIZES_ALL; i++) { int sum = 0; @@ -1535,23 +1595,63 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) { sum ? 128 * cpi->td.rd_counts.obmc_used[i][1] / sum : 0; frame_probs->obmc_probs[update_type][i] = (frame_probs->obmc_probs[update_type][i] + new_prob) >> 1; +#if CONFIG_FRAME_PARALLEL_ENCODE + /* TODO(FPMT): The current update is happening in cpi->frame_probs, + * this need to be taken care appropriately in final FPMT + * implementation to carry these values to subsequent frames. + * The frame_probs update is accumulated across frames, so the + * values from all individual parallel frames need to be taken + * into account after all the parallel frames are encoded. + * + * Only for quality simulation purpose - Update the accumulated frame + * probabilities in ppi->temp_variable based on the update flag. 
+ */ + if (cpi->do_frame_data_update) { + for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES; + update_type_idx++) { + temp_frame_probs->obmc_probs[update_type_idx][i] = + frame_probs->obmc_probs[update_type_idx][i]; + } + } +#endif // CONFIG_FRAME_PARALLEL_ENCODE } } if (features->allow_warped_motion && cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { - const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); int sum = 0; for (i = 0; i < 2; i++) sum += cpi->td.rd_counts.warped_used[i]; const int new_prob = sum ? 128 * cpi->td.rd_counts.warped_used[1] / sum : 0; frame_probs->warped_probs[update_type] = (frame_probs->warped_probs[update_type] + new_prob) >> 1; +#if CONFIG_FRAME_PARALLEL_ENCODE + /* TODO(FPMT): The current update is happening in cpi->frame_probs, + * this need to be taken care appropriately in final FPMT + * implementation to carry these values to subsequent frames. + * The frame_probs update is accumulated across frames, so the + * values from all individual parallel frames need to be taken + * into account after all the parallel frames are encoded. + * + * Only for quality simulation purpose - Update the accumulated frame + * probabilities in ppi->temp_variable based on the update flag. 
+ */ + if (cpi->do_frame_data_update) { + for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES; + update_type_idx++) { + temp_frame_probs->warped_probs[update_type_idx] = + frame_probs->warped_probs[update_type_idx]; + } + } +#endif // CONFIG_FRAME_PARALLEL_ENCODE } if (cm->current_frame.frame_type != KEY_FRAME && cpi->sf.interp_sf.adaptive_interp_filter_search == 2 && features->interp_filter == SWITCHABLE) { - const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { int sum = 0; @@ -1572,6 +1672,25 @@ static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) { left -= prob; if (j == 0) prob += left; frame_probs->switchable_interp_probs[update_type][i][j] = prob; +#if CONFIG_FRAME_PARALLEL_ENCODE + /* TODO(FPMT): The current update is happening in cpi->frame_probs, + * this need to be taken care appropriately in final FPMT + * implementation to carry these values to subsequent frames. + * The frame_probs update is accumulated across frames, so the + * values from all individual parallel frames need to be taken + * into account after all the parallel frames are encoded. + * + * Only for quality simulation purpose - Update the accumulated frame + * probabilities in ppi->temp_variable based on the update flag. 
+ */ + if (cpi->do_frame_data_update) { + for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES; + update_type_idx++) { + temp_frame_probs->switchable_interp_probs[update_type_idx][i][j] = + frame_probs->switchable_interp_probs[update_type_idx][i][j]; + } + } +#endif // CONFIG_FRAME_PARALLEL_ENCODE } } } diff --git a/third_party/libaom/source/libaom/av1/encoder/encodeframe_utils.c b/third_party/libaom/source/libaom/av1/encoder/encodeframe_utils.c index c10b2ffe6c..d3fa50292b 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encodeframe_utils.c +++ b/third_party/libaom/source/libaom/av1/encoder/encodeframe_utils.c @@ -44,7 +44,6 @@ void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit, assert(cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIM); - aom_clear_system_state(); for (row = mi_row / num_mi_w; row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { for (col = mi_col / num_mi_h; @@ -59,20 +58,19 @@ void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit, *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5); *rdmult = AOMMAX(*rdmult, 0); av1_set_error_per_bit(errorperbit, *rdmult); - aom_clear_system_state(); } // Return the end column for the current superblock, in unit of TPL blocks. static int get_superblock_tpl_column_end(const AV1_COMMON *const cm, int mi_col, int num_mi_w) { // Find the start column of this superblock. - const int sb_mi_col_start = (mi_col >> cm->seq_params.mib_size_log2) - << cm->seq_params.mib_size_log2; + const int sb_mi_col_start = (mi_col >> cm->seq_params->mib_size_log2) + << cm->seq_params->mib_size_log2; // Same but in superres upscaled dimension. const int sb_mi_col_start_sr = coded_to_superres_mi(sb_mi_col_start, cm->superres_scale_denominator); // Width of this superblock in mi units. - const int sb_mi_width = mi_size_wide[cm->seq_params.sb_size]; + const int sb_mi_width = mi_size_wide[cm->seq_params->sb_size]; // Same but in superres upscaled dimension. 
const int sb_mi_width_sr = coded_to_superres_mi(sb_mi_width, cm->superres_scale_denominator); @@ -86,15 +84,16 @@ int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize, const int mi_row, const int mi_col, int orig_rdmult) { const AV1_COMMON *const cm = &cpi->common; - const GF_GROUP *const gf_group = &cpi->gf_group; - assert(IMPLIES(cpi->gf_group.size > 0, - cpi->gf_group.index < cpi->gf_group.size)); - const int tpl_idx = cpi->gf_group.index; - const TplDepFrame *tpl_frame = &cpi->tpl_data.tpl_frame[tpl_idx]; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int tpl_idx = cpi->gf_frame_index; const int deltaq_rdmult = set_deltaq_rdmult(cpi, x); - if (tpl_frame->is_valid == 0) return deltaq_rdmult; - if (!is_frame_tpl_eligible(gf_group, gf_group->index)) return deltaq_rdmult; if (tpl_idx >= MAX_TPL_FRAME_IDX) return deltaq_rdmult; + const TplDepFrame *tpl_frame = &cpi->ppi->tpl_data.tpl_frame[tpl_idx]; + if (!tpl_frame->is_valid) return deltaq_rdmult; + if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) + return deltaq_rdmult; if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return deltaq_rdmult; const int mi_col_sr = @@ -116,7 +115,6 @@ int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, int row, col; double base_block_count = 0.0; double geom_mean_of_scale = 0.0; - aom_clear_system_state(); for (row = mi_row / num_mi_w; row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { for (col = mi_col_sr / num_mi_h; @@ -124,7 +122,7 @@ int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, col < sb_bcol_end; ++col) { const int index = row * num_cols + col; - geom_mean_of_scale += log(cpi->tpl_sb_rdmult_scaling_factors[index]); + geom_mean_of_scale += log(cpi->ppi->tpl_sb_rdmult_scaling_factors[index]); base_block_count += 1.0; } } @@ -132,8 +130,7 @@ int 
av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, int rdmult = (int)((double)orig_rdmult * geom_mean_of_scale + 0.5); rdmult = AOMMAX(rdmult, 0); av1_set_error_per_bit(&x->errorperbit, rdmult); - aom_clear_system_state(); - if (bsize == cm->seq_params.sb_size) { + if (bsize == cm->seq_params->sb_size) { const int rdmult_sb = set_deltaq_rdmult(cpi, x); assert(rdmult_sb == rdmult); (void)rdmult_sb; @@ -341,7 +338,7 @@ void av1_update_state(const AV1_COMP *const cpi, ThreadData *td, const int x_mis = AOMMIN(bw, mi_params->mi_cols - mi_col); const int y_mis = AOMMIN(bh, mi_params->mi_rows - mi_row); - if (cm->seq_params.order_hint_info.enable_ref_frame_mvs) + if (cm->seq_params->order_hint_info.enable_ref_frame_mvs) av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis); } @@ -604,9 +601,9 @@ static void set_partial_sb_partition(const AV1_COMMON *const cm, MB_MODE_INFO **mib) { int bh = bh_in; int r, c; - for (r = 0; r < cm->seq_params.mib_size; r += bh) { + for (r = 0; r < cm->seq_params->mib_size; r += bh) { int bw = bw_in; - for (c = 0; c < cm->seq_params.mib_size; c += bw) { + for (c = 0; c < cm->seq_params->mib_size; c += bw) { const int grid_index = get_mi_grid_idx(&cm->mi_params, r, c); const int mi_index = get_alloc_mi_idx(&cm->mi_params, r, c); mib[grid_index] = mi + mi_index; @@ -638,11 +635,11 @@ void av1_set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile, assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0)); // Apply the requested partition size to the SB if it is all "in image" - if ((mi_cols_remaining >= cm->seq_params.mib_size) && - (mi_rows_remaining >= cm->seq_params.mib_size)) { - for (int block_row = 0; block_row < cm->seq_params.mib_size; + if ((mi_cols_remaining >= cm->seq_params->mib_size) && + (mi_rows_remaining >= cm->seq_params->mib_size)) { + for (int block_row = 0; block_row < cm->seq_params->mib_size; block_row += bh) { - for (int block_col = 0; block_col < cm->seq_params.mib_size; + for (int 
block_col = 0; block_col < cm->seq_params->mib_size; block_col += bw) { const int grid_index = get_mi_grid_idx(mi_params, block_row, block_col); const int mi_index = get_alloc_mi_idx(mi_params, block_row, block_col); @@ -682,25 +679,25 @@ int av1_is_leaf_split_partition(AV1_COMMON *cm, int mi_row, int mi_col, int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, int orig_rdmult) { AV1_COMMON *const cm = &cpi->common; - const GF_GROUP *const gf_group = &cpi->gf_group; - assert(IMPLIES(cpi->gf_group.size > 0, - cpi->gf_group.index < cpi->gf_group.size)); - const int tpl_idx = cpi->gf_group.index; - TplParams *const tpl_data = &cpi->tpl_data; - TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; - TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int tpl_idx = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; - int tpl_stride = tpl_frame->stride; int64_t intra_cost = 0; int64_t mc_dep_cost = 0; const int mi_wide = mi_size_wide[bsize]; const int mi_high = mi_size_high[bsize]; - if (tpl_frame->is_valid == 0) return orig_rdmult; + if (tpl_idx >= MAX_TPL_FRAME_IDX) return orig_rdmult; - if (!is_frame_tpl_eligible(gf_group, gf_group->index)) return orig_rdmult; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + int tpl_stride = tpl_frame->stride; + if (!tpl_frame->is_valid) return orig_rdmult; - if (cpi->gf_group.index >= MAX_TPL_FRAME_IDX) return orig_rdmult; + if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return orig_rdmult; int mi_count = 0; const int mi_col_sr = @@ -727,8 +724,6 @@ int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, } assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB); - 
aom_clear_system_state(); - double beta = 1.0; if (mc_dep_cost > 0 && intra_cost > 0) { const double r0 = cpi->rd.r0; @@ -738,8 +733,6 @@ int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int rdmult = av1_get_adaptive_rdmult(cpi, beta); - aom_clear_system_state(); - rdmult = AOMMIN(rdmult, orig_rdmult * 3 / 2); rdmult = AOMMAX(rdmult, orig_rdmult * 1 / 2); @@ -760,7 +753,7 @@ int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) { if (is_stat_consumption_stage_twopass(cpi)) { const AV1_COMMON *const cm = &cpi->common; const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats( - &cpi->twopass, cm->current_frame.display_order_hint); + &cpi->ppi->twopass, cm->current_frame.display_order_hint); if (this_frame_stats == NULL) return AOM_CODEC_ERROR; // The inactive region is specified in MBs not mi units. @@ -790,7 +783,7 @@ int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) { if (is_stat_consumption_stage_twopass(cpi)) { const AV1_COMMON *const cm = &cpi->common; const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats( - &cpi->twopass, cm->current_frame.display_order_hint); + &cpi->ppi->twopass, cm->current_frame.display_order_hint); if (this_frame_stats == NULL) return AOM_CODEC_ERROR; // The inactive region is specified in MBs not mi units. 
@@ -814,24 +807,26 @@ void av1_get_tpl_stats_sb(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, if (!cpi->oxcf.algo_cfg.enable_tpl_model) return; if (cpi->common.current_frame.frame_type == KEY_FRAME) return; - const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); if (update_type == INTNL_OVERLAY_UPDATE || update_type == OVERLAY_UPDATE) return; - assert(IMPLIES(cpi->gf_group.size > 0, - cpi->gf_group.index < cpi->gf_group.size)); + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); AV1_COMMON *const cm = &cpi->common; - const int gf_group_index = cpi->gf_group.index; - TplParams *const tpl_data = &cpi->tpl_data; - TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_group_index]; - TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; - int tpl_stride = tpl_frame->stride; + const int gf_group_index = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; const int mi_wide = mi_size_wide[bsize]; const int mi_high = mi_size_high[bsize]; - if (tpl_frame->is_valid == 0) return; if (gf_group_index >= MAX_TPL_FRAME_IDX) return; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_group_index]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + int tpl_stride = tpl_frame->stride; + if (!tpl_frame->is_valid) return; + int mi_count = 0; int count = 0; const int mi_col_sr = @@ -889,26 +884,26 @@ void av1_get_tpl_stats_sb(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, int mi_col) { AV1_COMMON *const cm = &cpi->common; - const GF_GROUP *const gf_group = &cpi->gf_group; - assert(IMPLIES(cpi->gf_group.size > 0, - cpi->gf_group.index < cpi->gf_group.size)); - const int tpl_idx = cpi->gf_group.index; - TplParams *const tpl_data = &cpi->tpl_data; - TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; - TplDepStats 
*tpl_stats = tpl_frame->tpl_stats_ptr; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int tpl_idx = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; - int tpl_stride = tpl_frame->stride; int64_t intra_cost = 0; int64_t mc_dep_cost = 0; const int mi_wide = mi_size_wide[bsize]; const int mi_high = mi_size_high[bsize]; const int base_qindex = cm->quant_params.base_qindex; - if (tpl_frame->is_valid == 0) return base_qindex; + if (tpl_idx >= MAX_TPL_FRAME_IDX) return base_qindex; - if (!is_frame_tpl_eligible(gf_group, gf_group->index)) return base_qindex; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + int tpl_stride = tpl_frame->stride; + if (!tpl_frame->is_valid) return base_qindex; - if (cpi->gf_group.index >= MAX_TPL_FRAME_IDX) return base_qindex; + if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return base_qindex; int mi_count = 0; const int mi_col_sr = @@ -935,8 +930,6 @@ int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, BLOCK_SIZE bsize, } assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB); - aom_clear_system_state(); - int offset = 0; double beta = 1.0; if (mc_dep_cost > 0 && intra_cost > 0) { @@ -945,8 +938,7 @@ int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, BLOCK_SIZE bsize, beta = (r0 / rk); assert(beta > 0.0); } - offset = av1_get_deltaq_offset(cpi, base_qindex, beta); - aom_clear_system_state(); + offset = av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, beta); const DeltaQInfo *const delta_q_info = &cm->delta_q_info; offset = AOMMIN(offset, delta_q_info->delta_q_res * 9 - 1); @@ -1164,7 +1156,7 @@ void av1_avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr, void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, int offset) { 
unsigned int tmp_sse; unsigned int tmp_variance; - const BLOCK_SIZE bsize = cpi->common.seq_params.sb_size; + const BLOCK_SIZE bsize = cpi->common.seq_params->sb_size; uint8_t *src_y = cpi->source->y_buffer; int src_ystride = cpi->source->y_stride; uint8_t *last_src_y = cpi->last_source->y_buffer; @@ -1178,8 +1170,8 @@ void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, int offset) { #endif src_y += offset; last_src_y += offset; - tmp_variance = cpi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y, - last_src_ystride, &tmp_sse); + tmp_variance = cpi->ppi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y, + last_src_ystride, &tmp_sse); if (tmp_sse < avg_source_sse_threshold) x->content_state_sb.source_sad = kLowSad; else if (tmp_sse > avg_source_sse_threshold_high) @@ -1233,7 +1225,7 @@ void av1_backup_sb_state(SB_FIRST_PASS_STATS *sb_fp_stats, const AV1_COMP *cpi, const AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); - const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; xd->above_txfm_context = cm->above_contexts.txfm[tile_info->tile_row] + mi_col; @@ -1269,7 +1261,7 @@ void av1_restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats, AV1_COMP *cpi, const AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); - const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; av1_restore_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size, num_planes); @@ -1294,33 +1286,32 @@ void av1_restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats, AV1_COMP *cpi, #endif // CONFIG_INTERNAL_STATS } -// Checks for skip status of mv cost update. -static int skip_mv_cost_update(AV1_COMP *cpi, const TileInfo *const tile_info, - const int mi_row, const int mi_col) { - // For intra frames, mv cdfs are not updated during the encode. Hence, the mv - // cost calculation is skipped in this case. 
- if (frame_is_intra_only(&cpi->common)) return 1; - // mv_cost_upd_level=0: update happens at each sb, - // so return skip status as 0. - // mv_cost_upd_level=1: update happens once for each sb row, - // so return skip status as 1 for - // mi_col != tile_info->mi_col_start. - // mv_cost_upd_level=2: update happens once for a set of rows, - // so return skip status as 1 appropriately. - if (!cpi->sf.inter_sf.mv_cost_upd_level) return 0; +/*! Checks whether to skip updating the entropy cost based on tile info. + * + * This function contains codes common to both \ref skip_mv_cost_update and + * \ref skip_dv_cost_update. + */ +static int skip_cost_update(const SequenceHeader *seq_params, + const TileInfo *const tile_info, const int mi_row, + const int mi_col, + INTERNAL_COST_UPDATE_TYPE upd_level) { + if (upd_level == INTERNAL_COST_UPD_SB) return 0; + if (upd_level == INTERNAL_COST_UPD_OFF) return 1; + + // upd_level is at most as frequent as each sb_row in a tile. if (mi_col != tile_info->mi_col_start) return 1; - if (cpi->sf.inter_sf.mv_cost_upd_level == 2) { - AV1_COMMON *const cm = &cpi->common; - const int mib_size_log2 = cm->seq_params.mib_size_log2; + + if (upd_level == INTERNAL_COST_UPD_SBROW_SET) { + const int mib_size_log2 = seq_params->mib_size_log2; const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2; - const int sb_size = cm->seq_params.mib_size * MI_SIZE; + const int sb_size = seq_params->mib_size * MI_SIZE; const int tile_height = (tile_info->mi_row_end - tile_info->mi_row_start) * MI_SIZE; - // When mv_cost_upd_level = 2, the cost update happens once for 2, 4 sb - // rows for sb size 128, sb size 64 respectively. However, as the update - // will not be equally spaced in smaller resolutions making it equally - // spaced by calculating (mv_num_rows_cost_update) the number of rows - // after which the cost update should happen. 
+ // When upd_level = INTERNAL_COST_UPD_SBROW_SET, the cost update happens + // once for 2, 4 sb rows for sb size 128, sb size 64 respectively. However, + // as the update will not be equally spaced in smaller resolutions making + // it equally spaced by calculating (mv_num_rows_cost_update) the number of + // rows after which the cost update should happen. const int sb_size_update_freq_map[2] = { 2, 4 }; const int update_freq_sb_rows = sb_size_update_freq_map[sb_size != MAX_SB_SIZE]; @@ -1337,6 +1328,32 @@ static int skip_mv_cost_update(AV1_COMP *cpi, const TileInfo *const tile_info, return 0; } +// Checks for skip status of mv cost update. +static int skip_mv_cost_update(AV1_COMP *cpi, const TileInfo *const tile_info, + const int mi_row, const int mi_col) { + const AV1_COMMON *cm = &cpi->common; + // For intra frames, mv cdfs are not updated during the encode. Hence, the mv + // cost calculation is skipped in this case. + if (frame_is_intra_only(cm)) return 1; + + return skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col, + cpi->sf.inter_sf.mv_cost_upd_level); +} + +// Checks for skip status of dv cost update. +static int skip_dv_cost_update(AV1_COMP *cpi, const TileInfo *const tile_info, + const int mi_row, const int mi_col) { + const AV1_COMMON *cm = &cpi->common; + // Intrabc is only applicable to intra frames. So skip if intrabc is not + // allowed. 
+ if (!av1_allow_intrabc(cm) || is_stat_generation_stage(cpi)) { + return 1; + } + + return skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col, + cpi->sf.intra_sf.dv_cost_upd_level); +} + // Update the rate costs of some symbols according to the frequency directed // by speed features void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td, @@ -1355,6 +1372,9 @@ void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td, if (mi_col != tile_info->mi_col_start) break; AOM_FALLTHROUGH_INTENDED; case COST_UPD_SB: // SB level + if (cpi->sf.inter_sf.coeff_cost_upd_level == INTERNAL_COST_UPD_SBROW && + mi_col != tile_info->mi_col_start) + break; av1_fill_coeff_costs(&x->coeff_costs, xd->tile_ctx, num_planes); break; default: assert(0); @@ -1368,6 +1388,9 @@ void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td, if (mi_col != tile_info->mi_col_start) break; AOM_FALLTHROUGH_INTENDED; case COST_UPD_SB: // SB level + if (cpi->sf.inter_sf.mode_cost_upd_level == INTERNAL_COST_UPD_SBROW && + mi_col != tile_info->mi_col_start) + break; av1_fill_mode_rates(cm, &x->mode_costs, xd->tile_ctx); break; default: assert(0); @@ -1388,4 +1411,19 @@ void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td, break; default: assert(0); } + + switch (cpi->oxcf.cost_upd_freq.dv) { + case COST_UPD_OFF: + case COST_UPD_TILE: // Tile level + break; + case COST_UPD_SBROW: // SB row level in tile + if (mi_col != tile_info->mi_col_start) break; + AOM_FALLTHROUGH_INTENDED; + case COST_UPD_SB: // SB level + // Checks for skip status of dv cost update. 
+ if (skip_dv_cost_update(cpi, tile_info, mi_row, mi_col)) break; + av1_fill_dv_costs(&xd->tile_ctx->ndvc, x->dv_costs); + break; + default: assert(0); + } } diff --git a/third_party/libaom/source/libaom/av1/encoder/encodeframe_utils.h b/third_party/libaom/source/libaom/av1/encoder/encodeframe_utils.h index 7bdfad5cba..3096181885 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encodeframe_utils.h +++ b/third_party/libaom/source/libaom/av1/encoder/encodeframe_utils.h @@ -13,17 +13,68 @@ #define AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_ #include "aom_ports/aom_timer.h" +#include "aom_ports/system_state.h" #include "av1/common/reconinter.h" #include "av1/encoder/encoder.h" -#include "av1/encoder/partition_strategy.h" #include "av1/encoder/rdopt.h" #ifdef __cplusplus extern "C" { #endif +#define WRITE_FEATURE_TO_FILE 0 + +#define FEATURE_SIZE_SMS_SPLIT_FAST 6 +#define FEATURE_SIZE_SMS_SPLIT 17 +#define FEATURE_SIZE_SMS_PRUNE_PART 25 +#define FEATURE_SIZE_SMS_TERM_NONE 28 +#define FEATURE_SIZE_FP_SMS_TERM_NONE 20 +#define FEATURE_SIZE_MAX_MIN_PART_PRED 13 +#define MAX_NUM_CLASSES_MAX_MIN_PART_PRED 4 + +#define FEATURE_SMS_NONE_FLAG 1 +#define FEATURE_SMS_SPLIT_FLAG (1 << 1) +#define FEATURE_SMS_RECT_FLAG (1 << 2) + +#define FEATURE_SMS_PRUNE_PART_FLAG \ + (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG | FEATURE_SMS_RECT_FLAG) +#define FEATURE_SMS_SPLIT_MODEL_FLAG \ + (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG) + +// Number of sub-partitions in rectangular partition types. +#define SUB_PARTITIONS_RECT 2 + +// Number of sub-partitions in split partition type. +#define SUB_PARTITIONS_SPLIT 4 + +// Number of sub-partitions in AB partition types. +#define SUB_PARTITIONS_AB 3 + +// Number of sub-partitions in 4-way partition types. +#define SUB_PARTITIONS_PART4 4 + +// 4part parition types. +enum { HORZ4 = 0, VERT4, NUM_PART4_TYPES } UENUM1BYTE(PART4_TYPES); + +// AB parition types. 
+enum { + HORZ_A = 0, + HORZ_B, + VERT_A, + VERT_B, + NUM_AB_PARTS +} UENUM1BYTE(AB_PART_TYPE); + +// Rectangular parition types. +enum { HORZ = 0, VERT, NUM_RECT_PARTS } UENUM1BYTE(RECT_PART_TYPE); + +// Structure to keep win flags for HORZ and VERT partition evaluations. +typedef struct { + int rect_part_win[NUM_RECT_PARTS]; +} RD_RECT_PART_WIN_INFO; + enum { PICK_MODE_RD = 0, PICK_MODE_NONRD }; enum { @@ -218,47 +269,6 @@ static AOM_INLINE const FIRSTPASS_STATS *read_one_frame_stats(const TWO_PASS *p, return &p->stats_buf_ctx->stats_in_start[frm]; } -static BLOCK_SIZE dim_to_size(int dim) { - switch (dim) { - case 4: return BLOCK_4X4; - case 8: return BLOCK_8X8; - case 16: return BLOCK_16X16; - case 32: return BLOCK_32X32; - case 64: return BLOCK_64X64; - case 128: return BLOCK_128X128; - default: assert(0); return 0; - } -} - -static AOM_INLINE void set_max_min_partition_size(SuperBlockEnc *sb_enc, - AV1_COMP *cpi, MACROBLOCK *x, - const SPEED_FEATURES *sf, - BLOCK_SIZE sb_size, - int mi_row, int mi_col) { - const AV1_COMMON *cm = &cpi->common; - - sb_enc->max_partition_size = - AOMMIN(sf->part_sf.default_max_partition_size, - dim_to_size(cpi->oxcf.part_cfg.max_partition_size)); - sb_enc->min_partition_size = - AOMMAX(sf->part_sf.default_min_partition_size, - dim_to_size(cpi->oxcf.part_cfg.min_partition_size)); - sb_enc->max_partition_size = - AOMMIN(sb_enc->max_partition_size, cm->seq_params.sb_size); - sb_enc->min_partition_size = - AOMMIN(sb_enc->min_partition_size, cm->seq_params.sb_size); - - if (use_auto_max_partition(cpi, sb_size, mi_row, mi_col)) { - float features[FEATURE_SIZE_MAX_MIN_PART_PRED] = { 0.0f }; - - av1_get_max_min_partition_features(cpi, x, mi_row, mi_col, features); - sb_enc->max_partition_size = - AOMMAX(AOMMIN(av1_predict_max_partition(cpi, x, features), - sb_enc->max_partition_size), - sb_enc->min_partition_size); - } -} - int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, int orig_rdmult); @@ -335,6 
+345,57 @@ void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td, const TileInfo *const tile_info, const int mi_row, const int mi_col); +static AOM_INLINE void av1_dealloc_mb_data(struct AV1Common *cm, + struct macroblock *mb) { + if (mb->txfm_search_info.txb_rd_records) { + aom_free(mb->txfm_search_info.txb_rd_records); + mb->txfm_search_info.txb_rd_records = NULL; + } + const int num_planes = av1_num_planes(cm); + for (int plane = 0; plane < num_planes; plane++) { + if (mb->plane[plane].src_diff) { + aom_free(mb->plane[plane].src_diff); + mb->plane[plane].src_diff = NULL; + } + } + if (mb->e_mbd.seg_mask) { + aom_free(mb->e_mbd.seg_mask); + mb->e_mbd.seg_mask = NULL; + } + if (mb->winner_mode_stats) { + aom_free(mb->winner_mode_stats); + mb->winner_mode_stats = NULL; + } +} + +static AOM_INLINE void av1_alloc_mb_data(struct AV1Common *cm, + struct macroblock *mb, + int use_nonrd_pick_mode) { + if (!use_nonrd_pick_mode) { + mb->txfm_search_info.txb_rd_records = + (TxbRdRecords *)aom_malloc(sizeof(TxbRdRecords)); + } + const int num_planes = av1_num_planes(cm); + for (int plane = 0; plane < num_planes; plane++) { + const int subsampling_xy = + plane ? cm->seq_params->subsampling_x + cm->seq_params->subsampling_y + : 0; + const int sb_size = MAX_SB_SQUARE >> subsampling_xy; + CHECK_MEM_ERROR(cm, mb->plane[plane].src_diff, + (int16_t *)aom_memalign( + 32, sizeof(*mb->plane[plane].src_diff) * sb_size)); + } + CHECK_MEM_ERROR(cm, mb->e_mbd.seg_mask, + (uint8_t *)aom_memalign( + 16, 2 * MAX_SB_SQUARE * sizeof(mb->e_mbd.seg_mask[0]))); + const int winner_mode_count = frame_is_intra_only(cm) + ? MAX_WINNER_MODE_COUNT_INTRA + : MAX_WINNER_MODE_COUNT_INTER; + CHECK_MEM_ERROR(cm, mb->winner_mode_stats, + (WinnerModeStats *)aom_malloc( + winner_mode_count * sizeof(mb->winner_mode_stats[0]))); +} + // This function will compute the number of reference frames to be disabled // based on selective_ref_frame speed feature. 
static AOM_INLINE unsigned int get_num_refs_to_disable( @@ -359,7 +420,7 @@ static AOM_INLINE unsigned int get_num_refs_to_disable( #if !CONFIG_REALTIME_ONLY else if (is_stat_consumption_stage_twopass(cpi)) { const FIRSTPASS_STATS *const this_frame_stats = - read_one_frame_stats(&cpi->twopass, cur_frame_display_index); + read_one_frame_stats(&cpi->ppi->twopass, cur_frame_display_index); aom_clear_system_state(); const double coded_error_per_mb = this_frame_stats->coded_error / cpi->frame_info.num_mbs; diff --git a/third_party/libaom/source/libaom/av1/encoder/encodemb.c b/third_party/libaom/source/libaom/av1/encoder/encodemb.c index c9ee22034b..2a875e1223 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encodemb.c +++ b/third_party/libaom/source/libaom/av1/encoder/encodemb.c @@ -35,19 +35,19 @@ #include "av1/encoder/rd.h" #include "av1/encoder/rdopt.h" -void av1_subtract_block(const MACROBLOCKD *xd, int rows, int cols, - int16_t *diff, ptrdiff_t diff_stride, - const uint8_t *src8, ptrdiff_t src_stride, - const uint8_t *pred8, ptrdiff_t pred_stride) { +void av1_subtract_block(BitDepthInfo bd_info, int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src8, + ptrdiff_t src_stride, const uint8_t *pred8, + ptrdiff_t pred_stride) { assert(rows >= 4 && cols >= 4); #if CONFIG_AV1_HIGHBITDEPTH - if (is_cur_buf_hbd(xd)) { + if (bd_info.use_highbitdepth_buf) { aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, - pred8, pred_stride, xd->bd); + pred8, pred_stride, bd_info.bit_depth); return; } #endif - (void)xd; + (void)bd_info; aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8, pred_stride); } @@ -55,6 +55,7 @@ void av1_subtract_block(const MACROBLOCKD *xd, int rows, int cols, void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, int blk_col, int blk_row, TX_SIZE tx_size) { MACROBLOCKD *const xd = &x->e_mbd; + const BitDepthInfo bd_info = get_bit_depth_info(xd); struct 
macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane]; const int diff_stride = block_size_wide[plane_bsize]; @@ -66,8 +67,8 @@ void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, uint8_t *src = &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2]; int16_t *src_diff = &p->src_diff[(blk_row * diff_stride + blk_col) << MI_SIZE_LOG2]; - av1_subtract_block(xd, tx1d_height, tx1d_width, src_diff, diff_stride, src, - src_stride, dst, dst_stride); + av1_subtract_block(bd_info, tx1d_height, tx1d_width, src_diff, diff_stride, + src, src_stride, dst, dst_stride); } void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane) { @@ -77,9 +78,10 @@ void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane) { const int bw = block_size_wide[plane_bsize]; const int bh = block_size_high[plane_bsize]; const MACROBLOCKD *xd = &x->e_mbd; + const BitDepthInfo bd_info = get_bit_depth_info(xd); - av1_subtract_block(xd, bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, - pd->dst.buf, pd->dst.stride); + av1_subtract_block(bd_info, bh, bw, p->src_diff, bw, p->src.buf, + p->src.stride, pd->dst.buf, pd->dst.stride); } int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, @@ -132,13 +134,8 @@ const int DROPOUT_MULTIPLIER_Q_BASE = 32; // Base Q to compute multiplier. void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, TX_TYPE tx_type, int qindex) { - const struct macroblock_plane *const p = &mb->plane[plane]; - tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block); - tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block); const int tx_width = tx_size_wide[tx_size]; const int tx_height = tx_size_high[tx_size]; - const int max_eob = av1_get_max_eob(tx_size); - const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); // Early return if `qindex` is out of range. 
if (qindex > DROPOUT_Q_MAX || qindex < DROPOUT_Q_MIN) { @@ -156,6 +153,19 @@ void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, multiplier * CLIP(base_size, DROPOUT_AFTER_BASE_MIN, DROPOUT_AFTER_BASE_MAX); + av1_dropout_qcoeff_num(mb, plane, block, tx_size, tx_type, dropout_num_before, + dropout_num_after); +} + +void av1_dropout_qcoeff_num(MACROBLOCK *mb, int plane, int block, + TX_SIZE tx_size, TX_TYPE tx_type, + int dropout_num_before, int dropout_num_after) { + const struct macroblock_plane *const p = &mb->plane[plane]; + tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block); + tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block); + const int max_eob = av1_get_max_eob(tx_size); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + // Early return if there are not enough non-zero coefficients. if (p->eobs[block] == 0 || p->eobs[block] <= dropout_num_before) { return; @@ -172,7 +182,8 @@ void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, for (int i = 0; i < p->eobs[block]; ++i) { const int scan_idx = scan_order->scan[i]; - if (qcoeff[scan_idx] > DROPOUT_COEFF_MAX) { // Keep large coefficients. + if (abs(qcoeff[scan_idx]) > DROPOUT_COEFF_MAX) { + // Keep large coefficients. 
count_zeros_before = 0; count_zeros_after = 0; idx = -1; @@ -197,6 +208,7 @@ void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, if (count_nonzeros > DROPOUT_CONTINUITY_MAX) { count_zeros_before = 0; count_zeros_after = 0; + count_nonzeros = 0; idx = -1; eob = i + 1; } @@ -513,15 +525,17 @@ static void encode_block_inter(int plane, int block, int blk_row, int blk_col, const int bsw = tx_size_wide_unit[sub_txs]; const int bsh = tx_size_high_unit[sub_txs]; const int step = bsh * bsw; + const int row_end = + AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); + const int col_end = + AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); assert(bsw > 0 && bsh > 0); - for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { - for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { - const int offsetr = blk_row + row; + for (int row = 0; row < row_end; row += bsh) { + const int offsetr = blk_row + row; + for (int col = 0; col < col_end; col += bsw) { const int offsetc = blk_col + col; - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; - encode_block_inter(plane, block, offsetr, offsetc, plane_bsize, sub_txs, arg, dry_run); block += step; diff --git a/third_party/libaom/source/libaom/av1/encoder/encodemb.h b/third_party/libaom/source/libaom/av1/encoder/encodemb.h index fcd34a3908..f2dc956a65 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encodemb.h +++ b/third_party/libaom/source/libaom/av1/encoder/encodemb.h @@ -123,11 +123,16 @@ int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane, // `txb_entropy_ctx`, which `mb` points to, may be modified by this function. 
void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, TX_TYPE tx_type, int qindex); - -void av1_subtract_block(const MACROBLOCKD *xd, int rows, int cols, - int16_t *diff, ptrdiff_t diff_stride, - const uint8_t *src8, ptrdiff_t src_stride, - const uint8_t *pred8, ptrdiff_t pred_stride); +// Same as above, with the number of zeroes needed before/after a coeff to drop +// it explicitly passed in, instead of being derived from qindex. +void av1_dropout_qcoeff_num(MACROBLOCK *mb, int plane, int block, + TX_SIZE tx_size, TX_TYPE tx_type, + int dropout_num_before, int dropout_num_after); + +void av1_subtract_block(BitDepthInfo bd_info, int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src8, + ptrdiff_t src_stride, const uint8_t *pred8, + ptrdiff_t pred_stride); void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, int blk_col, int blk_row, TX_SIZE tx_size); diff --git a/third_party/libaom/source/libaom/av1/encoder/encodemv.c b/third_party/libaom/source/libaom/av1/encoder/encodemv.c index 86c6156d8f..4a7d87408c 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encodemv.c +++ b/third_party/libaom/source/libaom/av1/encoder/encodemv.c @@ -173,8 +173,8 @@ static void build_nmv_component_cost_table(int *mvcost, } } -void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref, - nmv_context *mvctx, int usehp) { +void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, ThreadData *td, const MV *mv, + const MV *ref, nmv_context *mvctx, int usehp) { const MV diff = { mv->row - ref->row, mv->col - ref->col }; const MV_JOINT_TYPE j = av1_get_mv_joint(&diff); // If the mv_diff is zero, then we should have used near or nearest instead. @@ -193,8 +193,7 @@ void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref, // motion vector component used. 
if (cpi->sf.mv_sf.auto_mv_step_size) { int maxv = AOMMAX(abs(mv->row), abs(mv->col)) >> 3; - cpi->mv_search_params.max_mv_magnitude = - AOMMAX(maxv, cpi->mv_search_params.max_mv_magnitude); + td->max_mv_magnitude = AOMMAX(maxv, td->max_mv_magnitude); } } diff --git a/third_party/libaom/source/libaom/av1/encoder/encodemv.h b/third_party/libaom/source/libaom/av1/encoder/encodemv.h index 9f0d607295..962844bc79 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encodemv.h +++ b/third_party/libaom/source/libaom/av1/encoder/encodemv.h @@ -18,8 +18,8 @@ extern "C" { #endif -void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref, - nmv_context *mvctx, int usehp); +void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, ThreadData *td, const MV *mv, + const MV *ref, nmv_context *mvctx, int usehp); void av1_update_mv_stats(const MV *mv, const MV *ref, nmv_context *mvctx, MvSubpelPrecision precision); diff --git a/third_party/libaom/source/libaom/av1/encoder/encoder.c b/third_party/libaom/source/libaom/av1/encoder/encoder.c index 955d15631c..41122ef45b 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encoder.c +++ b/third_party/libaom/source/libaom/av1/encoder/encoder.c @@ -51,6 +51,7 @@ #include "av1/encoder/aq_variance.h" #include "av1/encoder/bitstream.h" #include "av1/encoder/context_tree.h" +#include "av1/encoder/dwt.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encodemv.h" #include "av1/encoder/encode_strategy.h" @@ -81,10 +82,6 @@ #define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7 -#if CONFIG_ENTROPY_STATS -FRAME_COUNTS aggregate_fc; -#endif // CONFIG_ENTROPY_STATS - // #define OUTPUT_YUV_REC #ifdef OUTPUT_YUV_REC FILE *yuv_rec_file; @@ -228,7 +225,7 @@ double av1_get_compression_ratio(const AV1_COMMON *const cm, const int upscaled_width = cm->superres_upscaled_width; const int height = cm->height; const int luma_pic_size = upscaled_width * height; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader 
*const seq_params = cm->seq_params; const BITSTREAM_PROFILE profile = seq_params->profile; const int pic_size_profile_factor = profile == PROFILE_0 ? 15 : (profile == PROFILE_1 ? 30 : 36); @@ -242,7 +239,7 @@ double av1_get_compression_ratio(const AV1_COMMON *const cm, static void set_tile_info(AV1_COMMON *const cm, const TileConfig *const tile_cfg) { const CommonModeInfoParams *const mi_params = &cm->mi_params; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; CommonTileParams *const tiles = &cm->tiles; int i, start_sb; @@ -298,7 +295,7 @@ void av1_update_frame_size(AV1_COMP *cpi) { // We need to reallocate the context buffers here in case we need more mis. if (av1_alloc_context_buffers(cm, cm->width, cm->height)) { - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate context buffers"); } av1_init_mi_buffers(&cm->mi_params); @@ -308,8 +305,10 @@ void av1_update_frame_size(AV1_COMP *cpi) { if (!is_stat_generation_stage(cpi)) alloc_context_buffers_ext(cm, &cpi->mbmi_ext_info); - if (!cpi->seq_params_locked) - set_sb_size(&cm->seq_params, av1_select_sb_size(cpi)); + if (!cpi->ppi->seq_params_locked) + set_sb_size(cm->seq_params, + av1_select_sb_size(&cpi->oxcf, cm->width, cm->height, + cpi->svc.number_spatial_layers)); set_tile_info(cm, &cpi->oxcf.tile_cfg); } @@ -327,9 +326,9 @@ static INLINE int does_level_match(int width, int height, double fps, height <= lvl_height * lvl_dim_mult; } -static void set_bitstream_level_tier(SequenceHeader *seq, AV1_COMMON *cm, - int width, int height, - double init_framerate) { +static void set_bitstream_level_tier(AV1_PRIMARY *const ppi, int width, + int height, double init_framerate) { + SequenceHeader *const seq_params = &ppi->seq_params; // TODO(any): This is a placeholder function that only addresses dimensions // and max display sample rates. 
// Need to add checks for max bit rate, max decoded luma sample rate, header @@ -372,26 +371,26 @@ static void set_bitstream_level_tier(SequenceHeader *seq, AV1_COMMON *cm, level = SEQ_LEVEL_6_2; } - SequenceHeader *const seq_params = &cm->seq_params; for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { - seq->seq_level_idx[i] = level; + seq_params->seq_level_idx[i] = level; // Set the maximum parameters for bitrate and buffer size for this profile, // level, and tier seq_params->op_params[i].bitrate = av1_max_level_bitrate( - cm->seq_params.profile, seq->seq_level_idx[i], seq->tier[i]); + seq_params->profile, seq_params->seq_level_idx[i], seq_params->tier[i]); // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass the // check if (seq_params->op_params[i].bitrate == 0) aom_internal_error( - &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + &ppi->error, AOM_CODEC_UNSUP_BITSTREAM, "AV1 does not support this combination of profile, level, and tier."); // Buffer size in bits/s is bitrate in bits/s * 1 s seq_params->op_params[i].buffer_size = seq_params->op_params[i].bitrate; } } -void av1_init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm, +void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi, const AV1EncoderConfig *oxcf, int use_svc) { + SequenceHeader *const seq = &ppi->seq_params; const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg; const ToolCfg *const tool_cfg = &oxcf->tool_cfg; @@ -449,7 +448,7 @@ void av1_init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm, seq->enable_intra_edge_filter = oxcf->intra_mode_cfg.enable_intra_edge_filter; seq->enable_filter_intra = oxcf->intra_mode_cfg.enable_filter_intra; - set_bitstream_level_tier(seq, cm, frm_dim_cfg->width, frm_dim_cfg->height, + set_bitstream_level_tier(ppi, frm_dim_cfg->width, frm_dim_cfg->height, oxcf->input_cfg.init_framerate); if (seq->operating_points_cnt_minus_1 == 0) { @@ -461,26 +460,27 @@ void av1_init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm, // skip 
decoding enhancement layers (temporal first). int i = 0; assert(seq->operating_points_cnt_minus_1 == - (int)(cm->number_spatial_layers * cm->number_temporal_layers - 1)); - for (unsigned int sl = 0; sl < cm->number_spatial_layers; sl++) { - for (unsigned int tl = 0; tl < cm->number_temporal_layers; tl++) { + (int)(ppi->number_spatial_layers * ppi->number_temporal_layers - 1)); + for (unsigned int sl = 0; sl < ppi->number_spatial_layers; sl++) { + for (unsigned int tl = 0; tl < ppi->number_temporal_layers; tl++) { seq->operating_point_idc[i] = - (~(~0u << (cm->number_spatial_layers - sl)) << 8) | - ~(~0u << (cm->number_temporal_layers - tl)); + (~(~0u << (ppi->number_spatial_layers - sl)) << 8) | + ~(~0u << (ppi->number_temporal_layers - tl)); i++; } } } } -static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) { - AV1_COMMON *const cm = &cpi->common; - SequenceHeader *const seq_params = &cm->seq_params; - ResizePendingParams *resize_pending_params = &cpi->resize_pending_params; +static void init_config_sequence(struct AV1_PRIMARY *ppi, + AV1EncoderConfig *oxcf) { + SequenceHeader *const seq_params = &ppi->seq_params; const DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg; const ColorCfg *const color_cfg = &oxcf->color_cfg; - cpi->oxcf = *oxcf; - cpi->framerate = oxcf->input_cfg.init_framerate; + + ppi->use_svc = 0; + ppi->number_spatial_layers = 1; + ppi->number_temporal_layers = 1; seq_params->profile = oxcf->profile; seq_params->bit_depth = oxcf->tool_cfg.bit_depth; @@ -508,7 +508,7 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) { // set the decoder model parameters in schedule mode seq_params->decoder_model_info.num_units_in_decoding_tick = dec_model_cfg->num_units_in_decoding_tick; - cm->buffer_removal_time_present = 1; + ppi->buffer_removal_time_present = 1; av1_set_aom_dec_model_info(&seq_params->decoder_model_info); av1_set_dec_model_op_parameters(&seq_params->op_params[0]); } else if 
(seq_params->timing_info_present && @@ -546,11 +546,19 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) { } } } + av1_change_config_seq(ppi, oxcf, NULL); +} + +static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) { + AV1_COMMON *const cm = &cpi->common; + ResizePendingParams *resize_pending_params = &cpi->resize_pending_params; + + cpi->oxcf = *oxcf; + cpi->framerate = oxcf->input_cfg.init_framerate; cm->width = oxcf->frm_dim_cfg.width; cm->height = oxcf->frm_dim_cfg.height; - set_sb_size(seq_params, - av1_select_sb_size(cpi)); // set sb size before allocations + alloc_compressor_data(cpi); av1_update_film_grain_parameters(cpi, oxcf); @@ -559,18 +567,15 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) { cpi->td.counts = &cpi->counts; // Set init SVC parameters. - cpi->use_svc = 0; - cpi->svc.external_ref_frame_config = 0; + cpi->svc.set_ref_frame_config = 0; cpi->svc.non_reference_frame = 0; cpi->svc.number_spatial_layers = 1; cpi->svc.number_temporal_layers = 1; - cm->number_spatial_layers = 1; - cm->number_temporal_layers = 1; cm->spatial_layer_id = 0; cm->temporal_layer_id = 0; // change includes all joint functionality - av1_change_config(cpi, oxcf); + av1_change_config(cpi, oxcf, true); cpi->ref_frame_flags = 0; @@ -583,25 +588,13 @@ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) { av1_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height); } -void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { - AV1_COMMON *const cm = &cpi->common; - SequenceHeader *const seq_params = &cm->seq_params; - RATE_CONTROL *const rc = &cpi->rc; - MACROBLOCK *const x = &cpi->td.mb; - AV1LevelParams *const level_params = &cpi->level_params; - InitialDimensions *const initial_dimensions = &cpi->initial_dimensions; - RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame; - const FrameDimensionCfg *const frm_dim_cfg = &cpi->oxcf.frm_dim_cfg; +void 
av1_change_config_seq(struct AV1_PRIMARY *ppi, + const AV1EncoderConfig *oxcf, + bool *is_sb_size_changed) { + SequenceHeader *const seq_params = &ppi->seq_params; + const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg; const DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg; const ColorCfg *const color_cfg = &oxcf->color_cfg; - const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; - // in case of LAP, lag in frames is set according to number of lap buffers - // calculated at init time. This stores and restores LAP's lag in frames to - // prevent override by new cfg. - int lap_lag_in_frames = -1; - if (cpi->lap_enabled && cpi->compressor_stage == LAP_STAGE) { - lap_lag_in_frames = cpi->oxcf.gf_cfg.lag_in_frames; - } if (seq_params->profile != oxcf->profile) seq_params->profile = oxcf->profile; seq_params->bit_depth = oxcf->tool_cfg.bit_depth; @@ -632,7 +625,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { // set the decoder model parameters in schedule mode seq_params->decoder_model_info.num_units_in_decoding_tick = dec_model_cfg->num_units_in_decoding_tick; - cm->buffer_removal_time_present = 1; + ppi->buffer_removal_time_present = 1; av1_set_aom_dec_model_info(&seq_params->decoder_model_info); av1_set_dec_model_op_parameters(&seq_params->op_params[0]); } else if (seq_params->timing_info_present && @@ -645,6 +638,56 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { 10; // Default value (not signaled) } + av1_update_film_grain_parameters_seq(ppi, oxcf); + + int sb_size = seq_params->sb_size; + // Superblock size should not be updated after the first key frame. 
+ if (!ppi->seq_params_locked) { + set_sb_size(seq_params, av1_select_sb_size(oxcf, frm_dim_cfg->width, + frm_dim_cfg->height, + ppi->number_spatial_layers)); + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) + seq_params->tier[i] = (oxcf->tier_mask >> i) & 1; + } + if (is_sb_size_changed != NULL && sb_size != seq_params->sb_size) + *is_sb_size_changed = true; + + // Init sequence level coding tools + // This should not be called after the first key frame. + if (!ppi->seq_params_locked) { + seq_params->operating_points_cnt_minus_1 = + (ppi->number_spatial_layers > 1 || ppi->number_temporal_layers > 1) + ? ppi->number_spatial_layers * ppi->number_temporal_layers - 1 + : 0; + av1_init_seq_coding_tools(ppi, oxcf, ppi->use_svc); + } + +#if CONFIG_AV1_HIGHBITDEPTH + highbd_set_var_fns(ppi); +#endif +} + +void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf, + bool is_sb_size_changed) { + AV1_COMMON *const cm = &cpi->common; + SequenceHeader *const seq_params = cm->seq_params; + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + MACROBLOCK *const x = &cpi->td.mb; + AV1LevelParams *const level_params = &cpi->ppi->level_params; + InitialDimensions *const initial_dimensions = &cpi->initial_dimensions; + RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame; + const FrameDimensionCfg *const frm_dim_cfg = &cpi->oxcf.frm_dim_cfg; + const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; + + // in case of LAP, lag in frames is set according to number of lap buffers + // calculated at init time. This stores and restores LAP's lag in frames to + // prevent override by new cfg. 
+ int lap_lag_in_frames = -1; + if (cpi->ppi->lap_enabled && cpi->compressor_stage == LAP_STAGE) { + lap_lag_in_frames = cpi->oxcf.gf_cfg.lag_in_frames; + } + av1_update_film_grain_parameters(cpi, oxcf); cpi->oxcf = *oxcf; @@ -680,10 +723,10 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { seq_params->tier[0]); } - if ((has_no_stats_stage(cpi)) && (rc_cfg->mode == AOM_Q)) { - rc->baseline_gf_interval = FIXED_GF_INTERVAL; + if (has_no_stats_stage(cpi) && (rc_cfg->mode == AOM_Q)) { + p_rc->baseline_gf_interval = FIXED_GF_INTERVAL; } else { - rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2; + p_rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2; } refresh_frame_flags->golden_frame = false; @@ -720,16 +763,23 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { } } + if (x->pixel_gradient_info == NULL) { + const int plane_types = PLANE_TYPES >> cm->seq_params->monochrome; + CHECK_MEM_ERROR(cm, x->pixel_gradient_info, + aom_malloc(sizeof(*x->pixel_gradient_info) * plane_types * + MAX_SB_SQUARE)); + } + av1_reset_segment_features(cm); av1_set_high_precision_mv(cpi, 1, 0); - set_rc_buffer_sizes(rc, rc_cfg); + set_rc_buffer_sizes(cpi); // Under a configuration change, where maximum_buffer_size may change, // keep buffer level clipped to the maximum allowed buffer size. - rc->bits_off_target = AOMMIN(rc->bits_off_target, rc->maximum_buffer_size); - rc->buffer_level = AOMMIN(rc->buffer_level, rc->maximum_buffer_size); + rc->bits_off_target = AOMMIN(rc->bits_off_target, p_rc->maximum_buffer_size); + rc->buffer_level = AOMMIN(rc->buffer_level, p_rc->maximum_buffer_size); // Set up frame rate and related parameters rate control values. 
av1_new_framerate(cpi, cpi->framerate); @@ -752,18 +802,9 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { cm->width = frm_dim_cfg->width; cm->height = frm_dim_cfg->height; - int sb_size = seq_params->sb_size; - // Superblock size should not be updated after the first key frame. - if (!cpi->seq_params_locked) { - set_sb_size(&cm->seq_params, av1_select_sb_size(cpi)); - for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) - seq_params->tier[i] = (oxcf->tier_mask >> i) & 1; - } - - if (initial_dimensions->width || sb_size != seq_params->sb_size) { + if (initial_dimensions->width || is_sb_size_changed) { if (cm->width > initial_dimensions->width || - cm->height > initial_dimensions->height || - seq_params->sb_size != sb_size) { + cm->height > initial_dimensions->height || is_sb_size_changed) { av1_free_context_buffers(cm); av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf); av1_free_sms_tree(&cpi->td); @@ -780,27 +821,15 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { set_tile_info(cm, &cpi->oxcf.tile_cfg); - if (!cpi->svc.external_ref_frame_config) + if (!cpi->svc.set_ref_frame_config) cpi->ext_flags.refresh_frame.update_pending = 0; cpi->ext_flags.refresh_frame_context_pending = 0; -#if CONFIG_AV1_HIGHBITDEPTH - highbd_set_var_fns(cpi); -#endif - - // Init sequence level coding tools - // This should not be called after the first key frame. - if (!cpi->seq_params_locked) { - seq_params->operating_points_cnt_minus_1 = - (cm->number_spatial_layers > 1 || cm->number_temporal_layers > 1) - ? cm->number_spatial_layers * cm->number_temporal_layers - 1 - : 0; - av1_init_seq_coding_tools(&cm->seq_params, cm, oxcf, cpi->use_svc); - } - - if (cpi->use_svc) + if (cpi->ppi->use_svc) av1_update_layer_context_change_config(cpi, rc_cfg->target_bandwidth); + check_reset_rc_flag(cpi); + // restore the value of lag_in_frame for LAP stage. 
if (lap_lag_in_frames != -1) { cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames; @@ -810,7 +839,7 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { static INLINE void init_frame_info(FRAME_INFO *frame_info, const AV1_COMMON *const cm) { const CommonModeInfoParams *const mi_params = &cm->mi_params; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; frame_info->frame_width = cm->width; frame_info->frame_height = cm->height; frame_info->mi_cols = mi_params->mi_cols; @@ -834,73 +863,44 @@ static INLINE void update_frame_index_set(FRAME_INDEX_SET *frame_index_set, } } -AV1_PRIMARY *av1_create_primary_compressor() { +AV1_PRIMARY *av1_create_primary_compressor( + struct aom_codec_pkt_list *pkt_list_head, int num_lap_buffers, + AV1EncoderConfig *oxcf) { AV1_PRIMARY *volatile const ppi = aom_memalign(32, sizeof(AV1_PRIMARY)); if (!ppi) return NULL; av1_zero(*ppi); - return ppi; -} - -AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf, - BufferPool *const pool, - FIRSTPASS_STATS *frame_stats_buf, - COMPRESSOR_STAGE stage, int num_lap_buffers, - int lap_lag_in_frames, - STATS_BUFFER_CTX *stats_buf_context) { - AV1_COMP *volatile const cpi = aom_memalign(32, sizeof(AV1_COMP)); - AV1_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL; - - if (!cm) return NULL; - - av1_zero(*cpi); - - cpi->ppi = ppi; - // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 // before it returns. 
- if (setjmp(cm->error.jmp)) { - cm->error.setjmp = 0; - av1_remove_compressor(cpi); + if (setjmp(ppi->error.jmp)) { + ppi->error.setjmp = 0; + av1_remove_primary_compressor(ppi); return 0; } + ppi->error.setjmp = 1; - cm->error.setjmp = 1; - cpi->lap_enabled = num_lap_buffers > 0; - cpi->compressor_stage = stage; - - CommonModeInfoParams *const mi_params = &cm->mi_params; - mi_params->free_mi = enc_free_mi; - mi_params->setup_mi = enc_setup_mi; - mi_params->set_mb_mi = (oxcf->pass == 1 || cpi->compressor_stage == LAP_STAGE) - ? stat_stage_set_mb_mi - : enc_set_mb_mi; - - mi_params->mi_alloc_bsize = BLOCK_4X4; - - CHECK_MEM_ERROR(cm, cm->fc, - (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc))); - CHECK_MEM_ERROR( - cm, cm->default_frame_context, - (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->default_frame_context))); - memset(cm->fc, 0, sizeof(*cm->fc)); - memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context)); - - cpi->common.buffer_pool = pool; + ppi->seq_params_locked = 0; + ppi->lap_enabled = num_lap_buffers > 0; + ppi->output_pkt_list = pkt_list_head; + ppi->b_calculate_psnr = CONFIG_INTERNAL_STATS; + ppi->frames_left = oxcf->input_cfg.limit; +#if CONFIG_FRAME_PARALLEL_ENCODE + ppi->max_mv_magnitude = 0; + ppi->num_fp_contexts = 1; +#endif - init_config(cpi, oxcf); - if (cpi->compressor_stage == LAP_STAGE) { - cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames; - } + init_config_sequence(ppi, oxcf); - cpi->frames_left = cpi->oxcf.input_cfg.limit; +#if CONFIG_ENTROPY_STATS + av1_zero(ppi->aggregate_fc); +#endif // CONFIG_ENTROPY_STATS - av1_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc); + av1_primary_rc_init(oxcf, &ppi->p_rc); // For two pass and lag_in_frames > 33 in LAP. 
- cpi->rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_2; - if (cpi->lap_enabled) { + ppi->p_rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_2; + if (ppi->lap_enabled) { if ((num_lap_buffers < (MAX_GF_LENGTH_LAP + SCENE_CUT_KEY_TEST_INTERVAL + 1)) && num_lap_buffers >= (MAX_GF_LENGTH_LAP + 3)) { @@ -908,219 +908,22 @@ AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf, * For lag in frames >= 19 and <33, enable scenecut * with limited future frame prediction. */ - cpi->rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_1; + ppi->p_rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_1; } else if (num_lap_buffers < (MAX_GF_LENGTH_LAP + 3)) { // Disable scenecut when lag_in_frames < 19. - cpi->rc.enable_scenecut_detection = DISABLE_SCENECUT; + ppi->p_rc.enable_scenecut_detection = DISABLE_SCENECUT; } } - init_frame_info(&cpi->frame_info, cm); - init_frame_index_set(&cpi->frame_index_set); - - cm->current_frame.frame_number = 0; - cm->current_frame_id = -1; - cpi->seq_params_locked = 0; - cpi->partition_search_skippable_frame = 0; - cpi->tile_data = NULL; - cpi->last_show_frame_buf = NULL; - realloc_segmentation_maps(cpi); - - cpi->refresh_frame.alt_ref_frame = false; - - cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; -#if CONFIG_INTERNAL_STATS - cpi->b_calculate_blockiness = 1; - cpi->b_calculate_consistency = 1; - cpi->total_inconsistency = 0; - cpi->psnr[0].worst = 100.0; - cpi->psnr[1].worst = 100.0; - cpi->worst_ssim = 100.0; - cpi->worst_ssim_hbd = 100.0; - - cpi->count[0] = 0; - cpi->count[1] = 0; - cpi->bytes = 0; -#if CONFIG_SPEED_STATS - cpi->tx_search_count = 0; -#endif // CONFIG_SPEED_STATS - - if (cpi->b_calculate_psnr) { - cpi->total_sq_error[0] = 0; - cpi->total_samples[0] = 0; - cpi->total_sq_error[1] = 0; - cpi->total_samples[1] = 0; - cpi->tot_recode_hits = 0; - cpi->summed_quality = 0; - cpi->summed_weights = 0; - cpi->summed_quality_hbd = 0; - cpi->summed_weights_hbd = 0; - } - - cpi->fastssim.worst = 100.0; - 
cpi->psnrhvs.worst = 100.0; - - if (cpi->b_calculate_blockiness) { - cpi->total_blockiness = 0; - cpi->worst_blockiness = 0.0; - } - - if (cpi->b_calculate_consistency) { - CHECK_MEM_ERROR( - cm, cpi->ssim_vars, - aom_malloc(sizeof(*cpi->ssim_vars) * 4 * cpi->common.mi_params.mi_rows * - cpi->common.mi_params.mi_cols)); - cpi->worst_consistency = 100.0; - } -#endif -#if CONFIG_ENTROPY_STATS - av1_zero(aggregate_fc); -#endif // CONFIG_ENTROPY_STATS - - cpi->time_stamps.first_ts_start = INT64_MAX; - -#ifdef OUTPUT_YUV_REC - yuv_rec_file = fopen("rec.yuv", "wb"); -#endif -#ifdef OUTPUT_YUV_DENOISED - yuv_denoised_file = fopen("denoised.yuv", "wb"); -#endif - - assert(MAX_LAP_BUFFERS >= MAX_LAG_BUFFERS); - int size = get_stats_buf_size(num_lap_buffers, MAX_LAG_BUFFERS); - for (int i = 0; i < size; i++) - cpi->twopass.frame_stats_arr[i] = &frame_stats_buf[i]; - - cpi->twopass.stats_buf_ctx = stats_buf_context; - cpi->twopass.stats_in = cpi->twopass.stats_buf_ctx->stats_in_start; - -#if !CONFIG_REALTIME_ONLY - if (is_stat_consumption_stage(cpi)) { - const size_t packet_sz = sizeof(FIRSTPASS_STATS); - const int packets = (int)(oxcf->twopass_stats_in.sz / packet_sz); - - if (!cpi->lap_enabled) { - /*Re-initialize to stats buffer, populated by application in the case of - * two pass*/ - cpi->twopass.stats_buf_ctx->stats_in_start = oxcf->twopass_stats_in.buf; - cpi->twopass.stats_in = cpi->twopass.stats_buf_ctx->stats_in_start; - cpi->twopass.stats_buf_ctx->stats_in_end = - &cpi->twopass.stats_buf_ctx->stats_in_start[packets - 1]; - - av1_init_second_pass(cpi); - } else { - av1_init_single_pass_lap(cpi); - } - } -#endif - - alloc_obmc_buffers(&cpi->td.mb.obmc_buffer, cm); - - CHECK_MEM_ERROR( - cm, cpi->td.mb.inter_modes_info, - (InterModesInfo *)aom_malloc(sizeof(*cpi->td.mb.inter_modes_info))); - - for (int x = 0; x < 2; x++) - for (int y = 0; y < 2; y++) - CHECK_MEM_ERROR( - cm, cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y], - (uint32_t *)aom_malloc( - 
AOM_BUFFER_SIZE_FOR_BLOCK_HASH * - sizeof(*cpi->td.mb.intrabc_hash_info.hash_value_buffer[0][0]))); - - cpi->td.mb.intrabc_hash_info.g_crc_initialized = 0; - - av1_set_speed_features_framesize_independent(cpi, oxcf->speed); - av1_set_speed_features_framesize_dependent(cpi, oxcf->speed); - - CHECK_MEM_ERROR(cm, cpi->consec_zero_mv, - aom_calloc((mi_params->mi_rows * mi_params->mi_cols) >> 2, - sizeof(*cpi->consec_zero_mv))); - - { - const int bsize = BLOCK_16X16; - const int w = mi_size_wide[bsize]; - const int h = mi_size_high[bsize]; - const int num_cols = (mi_params->mi_cols + w - 1) / w; - const int num_rows = (mi_params->mi_rows + h - 1) / h; - CHECK_MEM_ERROR(cm, cpi->tpl_rdmult_scaling_factors, - aom_calloc(num_rows * num_cols, - sizeof(*cpi->tpl_rdmult_scaling_factors))); - CHECK_MEM_ERROR(cm, cpi->tpl_sb_rdmult_scaling_factors, - aom_calloc(num_rows * num_cols, - sizeof(*cpi->tpl_sb_rdmult_scaling_factors))); - } - - { - const int bsize = BLOCK_16X16; - const int w = mi_size_wide[bsize]; - const int h = mi_size_high[bsize]; - const int num_cols = (mi_params->mi_cols + w - 1) / w; - const int num_rows = (mi_params->mi_rows + h - 1) / h; - CHECK_MEM_ERROR(cm, cpi->ssim_rdmult_scaling_factors, - aom_calloc(num_rows * num_cols, - sizeof(*cpi->ssim_rdmult_scaling_factors))); - } - -#if CONFIG_TUNE_VMAF - { - const int bsize = BLOCK_64X64; - const int w = mi_size_wide[bsize]; - const int h = mi_size_high[bsize]; - const int num_cols = (mi_params->mi_cols + w - 1) / w; - const int num_rows = (mi_params->mi_rows + h - 1) / h; - CHECK_MEM_ERROR(cm, cpi->vmaf_info.rdmult_scaling_factors, - aom_calloc(num_rows * num_cols, - sizeof(*cpi->vmaf_info.rdmult_scaling_factors))); - for (int i = 0; i < MAX_ARF_LAYERS; i++) { - cpi->vmaf_info.last_frame_unsharp_amount[i] = -1.0; - cpi->vmaf_info.last_frame_ysse[i] = -1.0; - cpi->vmaf_info.last_frame_vmaf[i] = -1.0; - } - cpi->vmaf_info.original_qindex = -1; - -#if CONFIG_USE_VMAF_RC - cpi->vmaf_info.vmaf_model = NULL; -#endif 
- } -#endif - -#if CONFIG_TUNE_BUTTERAUGLI - { - const int w = mi_size_wide[butteraugli_rdo_bsize]; - const int h = mi_size_high[butteraugli_rdo_bsize]; - const int num_cols = (mi_params->mi_cols + w - 1) / w; - const int num_rows = (mi_params->mi_rows + h - 1) / h; - CHECK_MEM_ERROR( - cm, cpi->butteraugli_info.rdmult_scaling_factors, - aom_malloc(num_rows * num_cols * - sizeof(*cpi->butteraugli_info.rdmult_scaling_factors))); - memset(&cpi->butteraugli_info.source, 0, - sizeof(cpi->butteraugli_info.source)); - memset(&cpi->butteraugli_info.resized_source, 0, - sizeof(cpi->butteraugli_info.resized_source)); - cpi->butteraugli_info.recon_set = false; - } -#endif - -#if !CONFIG_REALTIME_ONLY - if (!is_stat_generation_stage(cpi)) { - av1_setup_tpl_buffers(cm, &cpi->tpl_data, cpi->oxcf.gf_cfg.lag_in_frames); - } -#endif - -#if CONFIG_COLLECT_PARTITION_STATS - av1_zero(cpi->partition_stats); -#endif // CONFIG_COLLECT_PARTITION_STATS #define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; \ - cpi->fn_ptr[BT].jsdaf = JSDAF; \ - cpi->fn_ptr[BT].jsvaf = JSVAF; + ppi->fn_ptr[BT].sdf = SDF; \ + ppi->fn_ptr[BT].sdaf = SDAF; \ + ppi->fn_ptr[BT].vf = VF; \ + ppi->fn_ptr[BT].svf = SVF; \ + ppi->fn_ptr[BT].svaf = SVAF; \ + ppi->fn_ptr[BT].sdx4df = SDX4DF; \ + ppi->fn_ptr[BT].jsdaf = JSDAF; \ + ppi->fn_ptr[BT].jsvaf = JSVAF; // Realtime mode doesn't use 4x rectangular blocks. 
#if !CONFIG_REALTIME_ONLY @@ -1233,9 +1036,9 @@ AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf, #if !CONFIG_REALTIME_ONLY #define OBFP(BT, OSDF, OVF, OSVF) \ - cpi->fn_ptr[BT].osdf = OSDF; \ - cpi->fn_ptr[BT].ovf = OVF; \ - cpi->fn_ptr[BT].osvf = OSVF; + ppi->fn_ptr[BT].osdf = OSDF; \ + ppi->fn_ptr[BT].ovf = OVF; \ + ppi->fn_ptr[BT].osvf = OSVF; OBFP(BLOCK_128X128, aom_obmc_sad128x128, aom_obmc_variance128x128, aom_obmc_sub_pixel_variance128x128) @@ -1284,8 +1087,8 @@ AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf, #endif // !CONFIG_REALTIME_ONLY #define MBFP(BT, MCSDF, MCSVF) \ - cpi->fn_ptr[BT].msdf = MCSDF; \ - cpi->fn_ptr[BT].msvf = MCSVF; + ppi->fn_ptr[BT].msdf = MCSDF; \ + ppi->fn_ptr[BT].msvf = MCSVF; MBFP(BLOCK_128X128, aom_masked_sad128x128, aom_masked_sub_pixel_variance128x128) @@ -1315,8 +1118,8 @@ AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf, #endif #define SDSFP(BT, SDSF, SDSX4DF) \ - cpi->fn_ptr[BT].sdsf = SDSF; \ - cpi->fn_ptr[BT].sdsx4df = SDSX4DF; + ppi->fn_ptr[BT].sdsf = SDSF; \ + ppi->fn_ptr[BT].sdsx4df = SDSX4DF; SDSFP(BLOCK_128X128, aom_sad_skip_128x128, aom_sad_skip_128x128x4d); SDSFP(BLOCK_128X64, aom_sad_skip_128x64, aom_sad_skip_128x64x4d); @@ -1346,16 +1149,281 @@ AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf, #undef SDSFP #if CONFIG_AV1_HIGHBITDEPTH - highbd_set_var_fns(cpi); + highbd_set_var_fns(ppi); +#endif + + { + // As cm->mi_params is a part of the frame level context (cpi), it is + // unavailable at this point. mi_params is created as a local temporary + // variable, to be passed into the functions used for allocating tpl + // buffers. The values in this variable are populated according to initial + // width and height of the frame. 
+ CommonModeInfoParams mi_params; + enc_set_mb_mi(&mi_params, oxcf->frm_dim_cfg.width, + oxcf->frm_dim_cfg.height); + + const int bsize = BLOCK_16X16; + const int w = mi_size_wide[bsize]; + const int h = mi_size_high[bsize]; + const int num_cols = (mi_params.mi_cols + w - 1) / w; + const int num_rows = (mi_params.mi_rows + h - 1) / h; + AOM_CHECK_MEM_ERROR(&ppi->error, ppi->tpl_rdmult_scaling_factors, + aom_calloc(num_rows * num_cols, + sizeof(*ppi->tpl_rdmult_scaling_factors))); + AOM_CHECK_MEM_ERROR( + &ppi->error, ppi->tpl_sb_rdmult_scaling_factors, + aom_calloc(num_rows * num_cols, + sizeof(*ppi->tpl_sb_rdmult_scaling_factors))); + +#if !CONFIG_REALTIME_ONLY + if (oxcf->pass != 1) { + av1_setup_tpl_buffers(ppi, &mi_params, oxcf->frm_dim_cfg.width, + oxcf->frm_dim_cfg.height, 0, + oxcf->gf_cfg.lag_in_frames); + } +#endif + +#if CONFIG_INTERNAL_STATS + ppi->b_calculate_blockiness = 1; + ppi->b_calculate_consistency = 1; + + for (int i = 0; i <= STAT_ALL; i++) { + ppi->psnr[0].stat[i] = 0; + ppi->psnr[1].stat[i] = 0; + + ppi->fastssim.stat[i] = 0; + ppi->psnrhvs.stat[i] = 0; + } + + ppi->psnr[0].worst = 100.0; + ppi->psnr[1].worst = 100.0; + ppi->worst_ssim = 100.0; + ppi->worst_ssim_hbd = 100.0; + + ppi->count[0] = 0; + ppi->count[1] = 0; + ppi->total_bytes = 0; + + if (ppi->b_calculate_psnr) { + ppi->total_sq_error[0] = 0; + ppi->total_samples[0] = 0; + ppi->total_sq_error[1] = 0; + ppi->total_samples[1] = 0; + ppi->total_recode_hits = 0; + ppi->summed_quality = 0; + ppi->summed_weights = 0; + ppi->summed_quality_hbd = 0; + ppi->summed_weights_hbd = 0; + } + + ppi->fastssim.worst = 100.0; + ppi->psnrhvs.worst = 100.0; + + if (ppi->b_calculate_blockiness) { + ppi->total_blockiness = 0; + ppi->worst_blockiness = 0.0; + } + + ppi->total_inconsistency = 0; + ppi->worst_consistency = 100.0; + if (ppi->b_calculate_consistency) { + AOM_CHECK_MEM_ERROR(&ppi->error, ppi->ssim_vars, + aom_malloc(sizeof(*ppi->ssim_vars) * 4 * + mi_params.mi_rows * mi_params.mi_cols)); + } 
+#endif + } + + ppi->error.setjmp = 0; + return ppi; +} + +AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf, + BufferPool *const pool, COMPRESSOR_STAGE stage, + int lap_lag_in_frames) { + AV1_COMP *volatile const cpi = aom_memalign(32, sizeof(AV1_COMP)); + AV1_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL; + + if (!cm) return NULL; + + av1_zero(*cpi); + + cpi->ppi = ppi; + cm->seq_params = &ppi->seq_params; + cm->error = &ppi->error; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(cm->error->jmp)) { + cm->error->setjmp = 0; + av1_remove_compressor(cpi); + return 0; + } + + cm->error->setjmp = 1; + cpi->compressor_stage = stage; + +#if CONFIG_FRAME_PARALLEL_ENCODE + cpi->do_frame_data_update = true; +#endif + + CommonModeInfoParams *const mi_params = &cm->mi_params; + mi_params->free_mi = enc_free_mi; + mi_params->setup_mi = enc_setup_mi; + mi_params->set_mb_mi = (oxcf->pass == 1 || cpi->compressor_stage == LAP_STAGE) + ? 
stat_stage_set_mb_mi + : enc_set_mb_mi; + + mi_params->mi_alloc_bsize = BLOCK_4X4; + + CHECK_MEM_ERROR(cm, cm->fc, + (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc))); + CHECK_MEM_ERROR( + cm, cm->default_frame_context, + (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->default_frame_context))); + memset(cm->fc, 0, sizeof(*cm->fc)); + memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context)); + + cpi->common.buffer_pool = pool; + + init_config(cpi, oxcf); + if (cpi->compressor_stage == LAP_STAGE) { + cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames; + } + + av1_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc, &cpi->ppi->p_rc); + + init_frame_info(&cpi->frame_info, cm); + init_frame_index_set(&cpi->frame_index_set); + + cm->current_frame.frame_number = 0; + cm->current_frame_id = -1; + cpi->partition_search_skippable_frame = 0; + cpi->tile_data = NULL; + cpi->last_show_frame_buf = NULL; + realloc_segmentation_maps(cpi); + + cpi->refresh_frame.alt_ref_frame = false; + +#if CONFIG_SPEED_STATS + cpi->tx_search_count = 0; +#endif // CONFIG_SPEED_STATS + + cpi->time_stamps.first_ts_start = INT64_MAX; + +#ifdef OUTPUT_YUV_REC + yuv_rec_file = fopen("rec.yuv", "wb"); +#endif +#ifdef OUTPUT_YUV_DENOISED + yuv_denoised_file = fopen("denoised.yuv", "wb"); +#endif + +#if !CONFIG_REALTIME_ONLY + if (is_stat_consumption_stage(cpi)) { + const size_t packet_sz = sizeof(FIRSTPASS_STATS); + const int packets = (int)(oxcf->twopass_stats_in.sz / packet_sz); + + if (!cpi->ppi->lap_enabled) { + /*Re-initialize to stats buffer, populated by application in the case of + * two pass*/ + cpi->ppi->twopass.stats_buf_ctx->stats_in_start = + oxcf->twopass_stats_in.buf; + cpi->ppi->twopass.stats_in = + cpi->ppi->twopass.stats_buf_ctx->stats_in_start; + cpi->ppi->twopass.stats_buf_ctx->stats_in_end = + &cpi->ppi->twopass.stats_buf_ctx->stats_in_start[packets - 1]; + + av1_init_second_pass(cpi); + } else { + av1_init_single_pass_lap(cpi); + } + } +#endif + + 
alloc_obmc_buffers(&cpi->td.mb.obmc_buffer, cm); + + CHECK_MEM_ERROR( + cm, cpi->td.mb.inter_modes_info, + (InterModesInfo *)aom_malloc(sizeof(*cpi->td.mb.inter_modes_info))); + + for (int x = 0; x < 2; x++) + for (int y = 0; y < 2; y++) + CHECK_MEM_ERROR( + cm, cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y], + (uint32_t *)aom_malloc( + AOM_BUFFER_SIZE_FOR_BLOCK_HASH * + sizeof(*cpi->td.mb.intrabc_hash_info.hash_value_buffer[0][0]))); + + cpi->td.mb.intrabc_hash_info.g_crc_initialized = 0; + + av1_set_speed_features_framesize_independent(cpi, oxcf->speed); + av1_set_speed_features_framesize_dependent(cpi, oxcf->speed); + + CHECK_MEM_ERROR(cm, cpi->consec_zero_mv, + aom_calloc((mi_params->mi_rows * mi_params->mi_cols) >> 2, + sizeof(*cpi->consec_zero_mv))); + + { + const int bsize = BLOCK_16X16; + const int w = mi_size_wide[bsize]; + const int h = mi_size_high[bsize]; + const int num_cols = (mi_params->mi_cols + w - 1) / w; + const int num_rows = (mi_params->mi_rows + h - 1) / h; + CHECK_MEM_ERROR(cm, cpi->ssim_rdmult_scaling_factors, + aom_calloc(num_rows * num_cols, + sizeof(*cpi->ssim_rdmult_scaling_factors))); + } + +#if CONFIG_TUNE_VMAF + { + const int bsize = BLOCK_64X64; + const int w = mi_size_wide[bsize]; + const int h = mi_size_high[bsize]; + const int num_cols = (mi_params->mi_cols + w - 1) / w; + const int num_rows = (mi_params->mi_rows + h - 1) / h; + CHECK_MEM_ERROR(cm, cpi->vmaf_info.rdmult_scaling_factors, + aom_calloc(num_rows * num_cols, + sizeof(*cpi->vmaf_info.rdmult_scaling_factors))); + for (int i = 0; i < MAX_ARF_LAYERS; i++) { + cpi->vmaf_info.last_frame_unsharp_amount[i] = -1.0; + cpi->vmaf_info.last_frame_ysse[i] = -1.0; + cpi->vmaf_info.last_frame_vmaf[i] = -1.0; + } + cpi->vmaf_info.original_qindex = -1; + cpi->vmaf_info.vmaf_model = NULL; + } +#endif + +#if CONFIG_TUNE_BUTTERAUGLI + { + const int w = mi_size_wide[butteraugli_rdo_bsize]; + const int h = mi_size_high[butteraugli_rdo_bsize]; + const int num_cols = (mi_params->mi_cols 
+ w - 1) / w; + const int num_rows = (mi_params->mi_rows + h - 1) / h; + CHECK_MEM_ERROR( + cm, cpi->butteraugli_info.rdmult_scaling_factors, + aom_malloc(num_rows * num_cols * + sizeof(*cpi->butteraugli_info.rdmult_scaling_factors))); + memset(&cpi->butteraugli_info.source, 0, + sizeof(cpi->butteraugli_info.source)); + memset(&cpi->butteraugli_info.resized_source, 0, + sizeof(cpi->butteraugli_info.resized_source)); + cpi->butteraugli_info.recon_set = false; + } #endif +#if CONFIG_COLLECT_PARTITION_STATS + av1_zero(cpi->partition_stats); +#endif // CONFIG_COLLECT_PARTITION_STATS + /* av1_init_quantizer() is first called here. Add check in * av1_frame_init_quantizer() so that av1_init_quantizer is only * called later when needed. This will avoid unnecessary calls of * av1_init_quantizer() for every frame. */ av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); av1_qm_init(&cm->quant_params, av1_num_planes(cm)); av1_loop_filter_init(cm); @@ -1365,7 +1433,7 @@ AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf, #if !CONFIG_REALTIME_ONLY av1_loop_restoration_precal(); #endif - cm->error.setjmp = 0; + cm->error->setjmp = 0; return cpi; } @@ -1402,6 +1470,7 @@ static AOM_INLINE void free_thread_data(AV1_COMP *cpi) { for (int j = 0; j < 2; ++j) { aom_free(thread_data->td->tmp_pred_bufs[j]); } + aom_free(thread_data->td->pixel_gradient_info); release_obmc_buffers(&thread_data->td->obmc_buffer); aom_free(thread_data->td->vt64x64); @@ -1423,7 +1492,27 @@ static AOM_INLINE void free_thread_data(AV1_COMP *cpi) { void av1_remove_primary_compressor(AV1_PRIMARY *ppi) { if (!ppi) return; + aom_free_frame_buffer(&ppi->alt_ref_buffer); + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { + aom_free(ppi->level_params.level_info[i]); + } av1_lookahead_destroy(ppi->lookahead); + + aom_free(ppi->tpl_rdmult_scaling_factors); + ppi->tpl_rdmult_scaling_factors = NULL; + 
aom_free(ppi->tpl_sb_rdmult_scaling_factors); + ppi->tpl_sb_rdmult_scaling_factors = NULL; + + TplParams *const tpl_data = &ppi->tpl_data; + for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) { + aom_free(tpl_data->tpl_stats_pool[frame]); + aom_free_frame_buffer(&tpl_data->tpl_rec_pool[frame]); + } + +#if !CONFIG_REALTIME_ONLY + av1_tpl_dealloc(&tpl_data->tpl_mt_sync); +#endif + aom_free(ppi); } @@ -1432,127 +1521,6 @@ void av1_remove_compressor(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; if (cm->current_frame.frame_number > 0) { -#if CONFIG_ENTROPY_STATS - if (!is_stat_generation_stage(cpi)) { - fprintf(stderr, "Writing counts.stt\n"); - FILE *f = fopen("counts.stt", "wb"); - fwrite(&aggregate_fc, sizeof(aggregate_fc), 1, f); - fclose(f); - } -#endif // CONFIG_ENTROPY_STATS -#if CONFIG_INTERNAL_STATS - aom_clear_system_state(); - - if (!is_stat_generation_stage(cpi)) { - char headings[512] = { 0 }; - char results[512] = { 0 }; - FILE *f = fopen("opsnr.stt", "a"); - double time_encoded = - (cpi->time_stamps.prev_ts_end - cpi->time_stamps.first_ts_start) / - 10000000.000; - double total_encode_time = - (cpi->time_receive_data + cpi->time_compress_data) / 1000.000; - const double dr = - (double)cpi->bytes * (double)8 / (double)1000 / time_encoded; - const double peak = - (double)((1 << cpi->oxcf.input_cfg.input_bit_depth) - 1); - const double target_rate = - (double)cpi->oxcf.rc_cfg.target_bandwidth / 1000; - const double rate_err = ((100.0 * (dr - target_rate)) / target_rate); - - if (cpi->b_calculate_psnr) { - const double total_psnr = - aom_sse_to_psnr((double)cpi->total_samples[0], peak, - (double)cpi->total_sq_error[0]); - const double total_ssim = - 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0); - snprintf(headings, sizeof(headings), - "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t" - "AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t" - "WstPsnr\tWstSsim\tWstFast\tWstHVS\t" - "AVPsrnY\tAPsnrCb\tAPsnrCr"); - snprintf(results, sizeof(results), - 
"%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t" - "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" - "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" - "%7.3f\t%7.3f\t%7.3f", - dr, cpi->psnr[0].stat[STAT_ALL] / cpi->count[0], total_psnr, - cpi->psnr[0].stat[STAT_ALL] / cpi->count[0], total_psnr, - total_ssim, total_ssim, - cpi->fastssim.stat[STAT_ALL] / cpi->count[0], - cpi->psnrhvs.stat[STAT_ALL] / cpi->count[0], - cpi->psnr[0].worst, cpi->worst_ssim, cpi->fastssim.worst, - cpi->psnrhvs.worst, cpi->psnr[0].stat[STAT_Y] / cpi->count[0], - cpi->psnr[0].stat[STAT_U] / cpi->count[0], - cpi->psnr[0].stat[STAT_V] / cpi->count[0]); - - if (cpi->b_calculate_blockiness) { - SNPRINT(headings, "\t Block\tWstBlck"); - SNPRINT2(results, "\t%7.3f", cpi->total_blockiness / cpi->count[0]); - SNPRINT2(results, "\t%7.3f", cpi->worst_blockiness); - } - - if (cpi->b_calculate_consistency) { - double consistency = - aom_sse_to_psnr((double)cpi->total_samples[0], peak, - (double)cpi->total_inconsistency); - - SNPRINT(headings, "\tConsist\tWstCons"); - SNPRINT2(results, "\t%7.3f", consistency); - SNPRINT2(results, "\t%7.3f", cpi->worst_consistency); - } - - SNPRINT(headings, "\t Time\tRcErr\tAbsErr"); - SNPRINT2(results, "\t%8.0f", total_encode_time); - SNPRINT2(results, " %7.2f", rate_err); - SNPRINT2(results, " %7.2f", fabs(rate_err)); - - SNPRINT(headings, "\tAPsnr611"); - SNPRINT2(results, " %7.3f", - (6 * cpi->psnr[0].stat[STAT_Y] + cpi->psnr[0].stat[STAT_U] + - cpi->psnr[0].stat[STAT_V]) / - (cpi->count[0] * 8)); - -#if CONFIG_AV1_HIGHBITDEPTH - const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth; - const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; - if ((cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) && - (in_bit_depth < bit_depth)) { - const double peak_hbd = (double)((1 << bit_depth) - 1); - const double total_psnr_hbd = - aom_sse_to_psnr((double)cpi->total_samples[1], peak_hbd, - (double)cpi->total_sq_error[1]); - const double total_ssim_hbd = - 100 * pow(cpi->summed_quality_hbd / cpi->summed_weights_hbd, 8.0); 
- SNPRINT(headings, - "\t AVGPsnrH GLBPsnrH AVPsnrPH GLPsnrPH" - " AVPsnrYH APsnrCbH APsnrCrH WstPsnrH" - " AOMSSIMH VPSSIMPH WstSsimH"); - SNPRINT2(results, "\t%7.3f", - cpi->psnr[1].stat[STAT_ALL] / cpi->count[1]); - SNPRINT2(results, " %7.3f", total_psnr_hbd); - SNPRINT2(results, " %7.3f", - cpi->psnr[1].stat[STAT_ALL] / cpi->count[1]); - SNPRINT2(results, " %7.3f", total_psnr_hbd); - SNPRINT2(results, " %7.3f", - cpi->psnr[1].stat[STAT_Y] / cpi->count[1]); - SNPRINT2(results, " %7.3f", - cpi->psnr[1].stat[STAT_U] / cpi->count[1]); - SNPRINT2(results, " %7.3f", - cpi->psnr[1].stat[STAT_V] / cpi->count[1]); - SNPRINT2(results, " %7.3f", cpi->psnr[1].worst); - SNPRINT2(results, " %7.3f", total_ssim_hbd); - SNPRINT2(results, " %7.3f", total_ssim_hbd); - SNPRINT2(results, " %7.3f", cpi->worst_ssim_hbd); - } -#endif - fprintf(f, "%s\n", headings); - fprintf(f, "%s\n", results); - } - - fclose(f); - } -#endif // CONFIG_INTERNAL_STATS #if CONFIG_SPEED_STATS if (!is_stat_generation_stage(cpi)) { fprintf(stdout, "tx_search_count = %d\n", cpi->tx_search_count); @@ -1571,12 +1539,6 @@ void av1_remove_compressor(AV1_COMP *cpi) { av1_denoiser_free(&(cpi->denoiser)); #endif - TplParams *const tpl_data = &cpi->tpl_data; - for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) { - aom_free(tpl_data->tpl_stats_pool[frame]); - aom_free_frame_buffer(&tpl_data->tpl_rec_pool[frame]); - } - if (cpi->compressor_stage != LAP_STAGE) { terminate_worker_data(cpi); free_thread_data(cpi); @@ -1586,6 +1548,7 @@ void av1_remove_compressor(AV1_COMP *cpi) { #if CONFIG_MULTITHREAD pthread_mutex_t *const enc_row_mt_mutex_ = mt_info->enc_row_mt.mutex_; pthread_mutex_t *const gm_mt_mutex_ = mt_info->gm_sync.mutex_; + pthread_mutex_t *const pack_bs_mt_mutex_ = mt_info->pack_bs_sync.mutex_; if (enc_row_mt_mutex_ != NULL) { pthread_mutex_destroy(enc_row_mt_mutex_); aom_free(enc_row_mt_mutex_); @@ -1594,6 +1557,10 @@ void av1_remove_compressor(AV1_COMP *cpi) { pthread_mutex_destroy(gm_mt_mutex_); 
aom_free(gm_mt_mutex_); } + if (pack_bs_mt_mutex_ != NULL) { + pthread_mutex_destroy(pack_bs_mt_mutex_); + aom_free(pack_bs_mt_mutex_); + } #endif av1_row_mt_mem_dealloc(cpi); if (cpi->compressor_stage != LAP_STAGE) { @@ -1601,9 +1568,6 @@ void av1_remove_compressor(AV1_COMP *cpi) { aom_free(mt_info->workers); } -#if !CONFIG_REALTIME_ONLY - av1_tpl_dealloc(&tpl_data->tpl_mt_sync); -#endif if (mt_info->num_workers > 1) { av1_loop_filter_dealloc(&mt_info->lf_row_sync); av1_cdef_mt_dealloc(&mt_info->cdef_sync); @@ -1617,13 +1581,9 @@ void av1_remove_compressor(AV1_COMP *cpi) { dealloc_compressor_data(cpi); -#if CONFIG_INTERNAL_STATS - aom_free(cpi->ssim_vars); - cpi->ssim_vars = NULL; -#endif // CONFIG_INTERNAL_STATS + av1_ext_part_delete(&cpi->ext_part_controller); av1_remove_common(cm); - av1_free_ref_frame_buffers(cm->buffer_pool); aom_free(cpi); @@ -1667,7 +1627,7 @@ static void generate_psnr_packet(AV1_COMP *cpi) { #endif pkt.kind = AOM_CODEC_PSNR_PKT; - aom_codec_pkt_list_add(cpi->output_pkt_list, &pkt); + aom_codec_pkt_list_add(cpi->ppi->output_pkt_list, &pkt); } int av1_use_as_reference(int *ext_ref_frame_flags, int ref_frame_flags) { @@ -1781,7 +1741,12 @@ static void set_mv_search_params(AV1_COMP *cpi) { mv_search_params->mv_step_param = av1_init_search_range( AOMMIN(max_mv_def, 2 * mv_search_params->max_mv_magnitude)); } +#if CONFIG_FRAME_PARALLEL_ENCODE + // Reset max_mv_magnitude for parallel frames based on update flag. 
+ if (cpi->do_frame_data_update) mv_search_params->max_mv_magnitude = -1; +#else mv_search_params->max_mv_magnitude = -1; +#endif } } } @@ -1789,14 +1754,14 @@ static void set_mv_search_params(AV1_COMP *cpi) { void av1_set_screen_content_options(AV1_COMP *cpi, FeatureFlags *features) { const AV1_COMMON *const cm = &cpi->common; - if (cm->seq_params.force_screen_content_tools != 2) { + if (cm->seq_params->force_screen_content_tools != 2) { features->allow_screen_content_tools = features->allow_intrabc = - cm->seq_params.force_screen_content_tools; + cm->seq_params->force_screen_content_tools; return; } if (cpi->oxcf.mode == REALTIME) { - assert(cm->seq_params.reduced_still_picture_hdr); + assert(cm->seq_params->reduced_still_picture_hdr); features->allow_screen_content_tools = features->allow_intrabc = 0; return; } @@ -1814,7 +1779,7 @@ void av1_set_screen_content_options(AV1_COMP *cpi, FeatureFlags *features) { const int stride = cpi->unfiltered_source->y_stride; const int width = cpi->unfiltered_source->y_width; const int height = cpi->unfiltered_source->y_height; - const int bd = cm->seq_params.bit_depth; + const int bd = cm->seq_params->bit_depth; const int blk_w = 16; const int blk_h = 16; // These threshold values are selected experimentally. 
@@ -1960,7 +1925,7 @@ static void init_ref_frame_bufs(AV1_COMP *cpi) { void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth, int subsampling_x, int subsampling_y) { AV1_COMMON *const cm = &cpi->common; - SequenceHeader *const seq_params = &cm->seq_params; + SequenceHeader *const seq_params = cm->seq_params; InitialDimensions *const initial_dimensions = &cpi->initial_dimensions; if (!initial_dimensions->width || @@ -1994,11 +1959,11 @@ static void setup_denoiser_buffer(AV1_COMP *cpi) { if (cpi->oxcf.noise_sensitivity > 0 && !cpi->denoiser.frame_buffer_initialized) { if (av1_denoiser_alloc( - cm, &cpi->svc, &cpi->denoiser, cpi->use_svc, + cm, &cpi->svc, &cpi->denoiser, cpi->ppi->use_svc, cpi->oxcf.noise_sensitivity, cm->width, cm->height, - cm->seq_params.subsampling_x, cm->seq_params.subsampling_y, - cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, + cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS)) + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate denoiser"); } } @@ -2008,9 +1973,9 @@ static void setup_denoiser_buffer(AV1_COMP *cpi) { int av1_set_size_literal(AV1_COMP *cpi, int width, int height) { AV1_COMMON *cm = &cpi->common; InitialDimensions *const initial_dimensions = &cpi->initial_dimensions; - av1_check_initial_width(cpi, cm->seq_params.use_highbitdepth, - cm->seq_params.subsampling_x, - cm->seq_params.subsampling_y); + av1_check_initial_width(cpi, cm->seq_params->use_highbitdepth, + cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y); if (width <= 0 || height <= 0) return 1; @@ -2040,7 +2005,7 @@ int av1_set_size_literal(AV1_COMP *cpi, int width, int height) { void av1_set_frame_size(AV1_COMP *cpi, int width, int height) { AV1_COMMON *const cm = &cpi->common; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; 
const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; int ref_frame; @@ -2078,7 +2043,7 @@ void av1_set_frame_size(AV1_COMP *cpi, int width, int height) { if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows, cm->mi_params.mi_cols, av1_num_planes(cm))) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate context buffers"); } @@ -2088,11 +2053,16 @@ void av1_set_frame_size(AV1_COMP *cpi, int width, int height) { seq_params->subsampling_y, seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, NULL, cpi->oxcf.tool_cfg.enable_global_motion)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); + if (!is_stat_generation_stage(cpi)) + av1_alloc_cdef_buffers(cm, &cpi->mt_info.cdef_worker, + &cpi->mt_info.cdef_sync, + cpi->mt_info.num_mod_workers[MOD_CDEF]); + #if !CONFIG_REALTIME_ONLY - const int use_restoration = cm->seq_params.enable_restoration && + const int use_restoration = cm->seq_params->enable_restoration && !cm->features.all_lossless && !cm->tiles.large_scale; if (use_restoration) { @@ -2107,6 +2077,7 @@ void av1_set_frame_size(AV1_COMP *cpi, int width, int height) { av1_alloc_restoration_buffers(cm); } #endif + if (!is_stat_generation_stage(cpi)) alloc_util_frame_buffers(cpi); init_motion_estimation(cpi); @@ -2145,13 +2116,22 @@ static void cdef_restoration_frame(AV1_COMP *cpi, AV1_COMMON *cm, #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, cdef_time); #endif + const int num_workers = cpi->mt_info.num_mod_workers[MOD_CDEF]; // Find CDEF parameters av1_cdef_search(&cpi->mt_info, &cm->cur_frame->buf, cpi->source, cm, xd, - cpi->sf.lpf_sf.cdef_pick_method, cpi->td.mb.rdmult); + cpi->sf.lpf_sf.cdef_pick_method, cpi->td.mb.rdmult, + cpi->sf.rt_sf.skip_cdef_sb, cpi->rc.frames_since_key); // Apply the filter - 
if (!cpi->sf.rt_sf.skip_loopfilter_non_reference) - av1_cdef_frame(&cm->cur_frame->buf, cm, xd); + if (!cpi->sf.rt_sf.skip_loopfilter_non_reference) { + if (num_workers > 1) { + av1_cdef_frame_mt(cm, xd, cpi->mt_info.cdef_worker, + cpi->mt_info.workers, &cpi->mt_info.cdef_sync, + num_workers, av1_cdef_init_fb_row_mt); + } else { + av1_cdef_frame(&cm->cur_frame->buf, cm, xd, av1_cdef_init_fb_row); + } + } #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, cdef_time); #endif @@ -2211,11 +2191,19 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) { const int use_loopfilter = !cm->features.coded_lossless && !cm->tiles.large_scale; - const int use_cdef = cm->seq_params.enable_cdef && + const int use_cdef = cm->seq_params->enable_cdef && !cm->features.coded_lossless && !cm->tiles.large_scale; - const int use_restoration = cm->seq_params.enable_restoration && + const int use_restoration = cm->seq_params->enable_restoration && !cm->features.all_lossless && !cm->tiles.large_scale; + const int cur_width = cm->cur_frame->width; + const int cur_height = cm->cur_frame->height; + const int cur_width_mib = cm->mi_params.mi_cols * MI_SIZE; + const int cur_height_mib = cm->mi_params.mi_rows * MI_SIZE; + const int is_realtime = + cpi->sf.rt_sf.use_nonrd_pick_mode && !(cm->mi_params.mi_cols % 2) && + !(cm->mi_params.mi_rows % 2) && (cur_width_mib - cur_width < MI_SIZE) && + (cur_height_mib - cur_height < MI_SIZE); struct loopfilter *lf = &cm->lf; @@ -2238,13 +2226,13 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) { 0, #endif mt_info->workers, num_workers, - &mt_info->lf_row_sync); + &mt_info->lf_row_sync, is_realtime); else av1_loop_filter_frame(&cm->cur_frame->buf, cm, xd, #if CONFIG_LPF_MASK 0, #endif - 0, num_planes, 0); + 0, num_planes, 0, is_realtime); } #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, loop_filter_time); @@ -2278,16 +2266,17 @@ static int encode_without_recode(AV1_COMP *cpi) { int top_index = 0, bottom_index = 0, q = 0; 
YV12_BUFFER_CONFIG *unscaled = cpi->unscaled_source; InterpFilter filter_scaler = - cpi->use_svc ? svc->downsample_filter_type[svc->spatial_layer_id] - : EIGHTTAP_SMOOTH; - int phase_scaler = - cpi->use_svc ? svc->downsample_filter_phase[svc->spatial_layer_id] : 0; + cpi->ppi->use_svc ? svc->downsample_filter_type[svc->spatial_layer_id] + : EIGHTTAP_SMOOTH; + int phase_scaler = cpi->ppi->use_svc + ? svc->downsample_filter_phase[svc->spatial_layer_id] + : 0; set_size_independent_vars(cpi); av1_setup_frame_size(cpi); av1_set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); - if (!cpi->use_svc) { + if (!cpi->ppi->use_svc) { phase_scaler = 8; // 2:1 scaling. if ((cm->width << 1) == unscaled->y_crop_width && @@ -2315,6 +2304,12 @@ static int encode_without_recode(AV1_COMP *cpi) { printf("\n Encoding a frame:"); #endif +#if CONFIG_TUNE_BUTTERAUGLI + if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) { + av1_setup_butteraugli_rdmult(cpi); + } +#endif + aom_clear_system_state(); cpi->source = av1_scale_if_required(cm, unscaled, &cpi->scaled_source, @@ -2336,7 +2331,7 @@ static int encode_without_recode(AV1_COMP *cpi) { } #if CONFIG_AV1_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && cpi->use_svc) + if (cpi->oxcf.noise_sensitivity > 0 && cpi->ppi->use_svc) av1_denoiser_reset_on_first_frame(cpi); #endif @@ -2365,7 +2360,7 @@ static int encode_without_recode(AV1_COMP *cpi) { // (zero_mode is forced), and since the scaled references are only // use for newmv search, we can avoid scaling here. 
if (!frame_is_intra_only(cm) && - !(cpi->use_svc && cpi->svc.force_zero_mode_spatial_ref)) + !(cpi->ppi->use_svc && cpi->svc.force_zero_mode_spatial_ref)) av1_scale_references(cpi, filter_scaler, phase_scaler, 1); av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q, @@ -2373,7 +2368,7 @@ static int encode_without_recode(AV1_COMP *cpi) { av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed); if ((q_cfg->deltaq_mode != NO_DELTA_Q) || q_cfg->enable_chroma_deltaq) av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); av1_set_variance_partition_thresholds(cpi, q, 0); av1_setup_frame(cpi); @@ -2388,7 +2383,7 @@ static int encode_without_recode(AV1_COMP *cpi) { av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed); if (q_cfg->deltaq_mode != NO_DELTA_Q || q_cfg->enable_chroma_deltaq) av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); av1_set_variance_partition_thresholds(cpi, q, 0); if (frame_is_intra_only(cm) || cm->features.error_resilient_mode) av1_setup_frame(cpi); @@ -2432,7 +2427,7 @@ static int encode_without_recode(AV1_COMP *cpi) { end_timing(cpi, av1_encode_frame_time); #endif #if CONFIG_INTERNAL_STATS - ++cpi->tot_recode_hits; + ++cpi->frame_recode_hits; #endif aom_clear_system_state(); @@ -2504,7 +2499,7 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { if (!cpi->sf.hl_sf.disable_extra_sc_testing) av1_determine_sc_tools_with_encoding(cpi, q); -#if CONFIG_USE_VMAF_RC +#if CONFIG_TUNE_VMAF if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) { av1_vmaf_neg_preprocessing(cpi, cpi->unscaled_source); } @@ -2525,6 +2520,7 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { do { loop = 0; + int do_mv_stats_collection = 1; aom_clear_system_state(); // if frame was scaled calculate global_motion_search again if already @@ 
-2580,7 +2576,7 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { if (q_cfg->deltaq_mode != NO_DELTA_Q || q_cfg->enable_chroma_deltaq) av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); av1_set_variance_partition_thresholds(cpi, q, 0); @@ -2636,14 +2632,19 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { // transform / motion compensation build reconstruction frame av1_encode_frame(cpi); +#if CONFIG_FRAME_PARALLEL_ENCODE + // Disable mv_stats collection for parallel frames based on update flag. + if (!cpi->do_frame_data_update) do_mv_stats_collection = 0; +#endif // CONFIG_FRAME_PARALLEL_ENCODE + // Reset the mv_stats in case we are interrupted by an intraframe or an // overlay frame. - if (cpi->mv_stats.valid) { - av1_zero(cpi->mv_stats); + if (cpi->ppi->mv_stats.valid && do_mv_stats_collection) { + av1_zero(cpi->ppi->mv_stats); } // Gather the mv_stats for the next frame if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA && - av1_frame_allows_smart_mv(cpi)) { + av1_frame_allows_smart_mv(cpi) && do_mv_stats_collection) { av1_collect_mv_stats(cpi, q); } @@ -2653,6 +2654,9 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { aom_clear_system_state(); +#if CONFIG_BITRATE_ACCURACY + const int do_dummy_pack = 1; +#else // CONFIG_BITRATE_ACCURACY // Dummy pack of the bitstream using up to date stats to get an // accurate estimate of output frame size to determine if we need // to recode. 
@@ -2660,6 +2664,7 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { (cpi->sf.hl_sf.recode_loop >= ALLOW_RECODE_KFARFGF && oxcf->rc_cfg.mode != AOM_Q) || oxcf->rc_cfg.min_cr > 0; +#endif // CONFIG_BITRATE_ACCURACY if (do_dummy_pack) { av1_finalize_encoded_frame(cpi); int largest_tile_id = 0; // Output from bitstream: unused here @@ -2669,7 +2674,15 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { return AOM_CODEC_ERROR; } + // bits used for this frame rc->projected_frame_size = (int)(*size) << 3; + +#if CONFIG_BITRATE_ACCURACY + cpi->ppi->tpl_data.actual_gop_bitrate += rc->projected_frame_size; + printf("\nframe: %d, projected frame size: %d, total: %f\n", + cpi->gf_frame_index, rc->projected_frame_size, + cpi->ppi->tpl_data.actual_gop_bitrate); +#endif } #if CONFIG_TUNE_VMAF @@ -2688,15 +2701,19 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { #if CONFIG_TUNE_BUTTERAUGLI if (loop_count == 0 && oxcf->tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) { loop = 1; - av1_restore_butteraugli_source(cpi); + av1_setup_butteraugli_rdmult_and_restore_source(cpi, 0.4); } #endif +#if CONFIG_BITRATE_ACCURACY + loop = 0; // turn off recode loop when CONFIG_BITRATE_ACCURACY is on +#endif // CONFIG_BITRATE_ACCURACY + if (loop) { ++loop_count; #if CONFIG_INTERNAL_STATS - ++cpi->tot_recode_hits; + ++cpi->frame_recode_hits; #endif } #if CONFIG_COLLECT_COMPONENT_TIMING @@ -2796,12 +2813,12 @@ static int encode_with_recode_loop_and_filter(AV1_COMP *cpi, size_t *size, #endif AV1_COMMON *const cm = &cpi->common; - SequenceHeader *const seq_params = &cm->seq_params; + SequenceHeader *const seq_params = cm->seq_params; // Special case code to reduce pulsing when key frames are forced at a // fixed interval. 
Note the reconstruction error if it is the frame before // the force key frame - if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) { + if (cpi->ppi->p_rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) { #if CONFIG_AV1_HIGHBITDEPTH if (seq_params->use_highbitdepth) { cpi->ambient_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf); @@ -2884,7 +2901,7 @@ static int encode_with_and_without_superres(AV1_COMP *cpi, size_t *size, uint8_t *dest, int *largest_tile_id) { const AV1_COMMON *const cm = &cpi->common; - assert(cm->seq_params.enable_superres); + assert(cm->seq_params->enable_superres); assert(av1_superres_in_recode_allowed(cpi)); aom_codec_err_t err = AOM_CODEC_OK; av1_save_all_coding_context(cpi); @@ -2904,9 +2921,9 @@ static int encode_with_and_without_superres(AV1_COMP *cpi, size_t *size, int64_t superres_rates[SCALE_NUMERATOR]; int superres_largest_tile_ids[SCALE_NUMERATOR]; // Use superres for Key-frames and Alt-ref frames only. - const GF_GROUP *const gf_group = &cpi->gf_group; - if (gf_group->update_type[gf_group->index] != OVERLAY_UPDATE && - gf_group->update_type[gf_group->index] != INTNL_OVERLAY_UPDATE) { + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + if (gf_group->update_type[cpi->gf_frame_index] != OVERLAY_UPDATE && + gf_group->update_type[cpi->gf_frame_index] != INTNL_OVERLAY_UPDATE) { for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR; ++denom) { superres_cfg->superres_scale_denominator = denom; @@ -2952,7 +2969,7 @@ static int encode_with_and_without_superres(AV1_COMP *cpi, size_t *size, const int64_t this_rate = superres_rates[this_index]; const int this_largest_tile_id = superres_largest_tile_ids[this_index]; const double this_rdcost = RDCOST_DBL_WITH_NATIVE_BD_DIST( - rdmult, this_rate, this_sse, cm->seq_params.bit_depth); + rdmult, this_rate, this_sse, cm->seq_params->bit_depth); if (this_rdcost < proj_rdcost1) { sse1 = this_sse; rate1 = this_rate; @@ -2962,7 +2979,7 @@ static int 
encode_with_and_without_superres(AV1_COMP *cpi, size_t *size, } } const double proj_rdcost2 = RDCOST_DBL_WITH_NATIVE_BD_DIST( - rdmult, rate2, sse2, cm->seq_params.bit_depth); + rdmult, rate2, sse2, cm->seq_params->bit_depth); // Re-encode with superres if it's better. if (proj_rdcost1 < proj_rdcost2) { restore_all_coding_context(cpi); @@ -3007,9 +3024,9 @@ static int encode_with_and_without_superres(AV1_COMP *cpi, size_t *size, const int64_t rdmult = av1_compute_rd_mult_based_on_qindex(cpi, cm->quant_params.base_qindex); proj_rdcost1 = RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate1, sse1, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); const double proj_rdcost2 = RDCOST_DBL_WITH_NATIVE_BD_DIST( - rdmult, rate2, sse2, cm->seq_params.bit_depth); + rdmult, rate2, sse2, cm->seq_params->bit_depth); // Re-encode with superres if it's better. if (proj_rdcost1 < proj_rdcost2) { restore_all_coding_context(cpi); @@ -3034,6 +3051,42 @@ static int encode_with_and_without_superres(AV1_COMP *cpi, size_t *size, return err; } +#if !CONFIG_REALTIME_ONLY +static void calculate_frame_avg_haar_energy(AV1_COMP *cpi) { + TWO_PASS *const twopass = &cpi->ppi->twopass; + const FIRSTPASS_STATS *const total_stats = + twopass->stats_buf_ctx->total_stats; + + if (is_one_pass_rt_params(cpi) || + (cpi->oxcf.q_cfg.deltaq_mode != DELTA_Q_PERCEPTUAL) || + (is_fp_wavelet_energy_invalid(total_stats) == 0)) + return; + + const int num_mbs = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE) + ? 
cpi->initial_mbs + : cpi->common.mi_params.MBs; + const YV12_BUFFER_CONFIG *const unfiltered_source = cpi->unfiltered_source; + const uint8_t *const src = unfiltered_source->y_buffer; + const int hbd = unfiltered_source->flags & YV12_FLAG_HIGHBITDEPTH; + const int stride = unfiltered_source->y_stride; + const BLOCK_SIZE fp_block_size = + get_fp_block_size(cpi->is_screen_content_type); + const int fp_block_size_width = block_size_wide[fp_block_size]; + const int fp_block_size_height = block_size_high[fp_block_size]; + const int num_unit_cols = + get_num_blocks(unfiltered_source->y_crop_width, fp_block_size_width); + const int num_unit_rows = + get_num_blocks(unfiltered_source->y_crop_height, fp_block_size_height); + const int num_8x8_cols = num_unit_cols * (fp_block_size_width / 8); + const int num_8x8_rows = num_unit_rows * (fp_block_size_height / 8); + int64_t frame_avg_wavelet_energy = av1_haar_ac_sad_mxn_uint8_input( + src, stride, hbd, num_8x8_rows, num_8x8_cols); + + twopass->frame_avg_haar_energy = + log(((double)frame_avg_wavelet_energy / num_mbs) + 1.0); +} +#endif + extern void av1_print_frame_contexts(const FRAME_CONTEXT *fc, const char *filename); @@ -3055,7 +3108,7 @@ extern void av1_print_frame_contexts(const FRAME_CONTEXT *fc, static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest) { AV1_COMMON *const cm = &cpi->common; - SequenceHeader *const seq_params = &cm->seq_params; + SequenceHeader *const seq_params = cm->seq_params; CurrentFrame *const current_frame = &cm->current_frame; const AV1EncoderConfig *const oxcf = &cpi->oxcf; struct segmentation *const seg = &cm->seg; @@ -3070,6 +3123,10 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, av1_set_screen_content_options(cpi, features); } +#if !CONFIG_REALTIME_ONLY + calculate_frame_avg_haar_energy(cpi); +#endif + // frame type has been decided outside of this function call cm->cur_frame->frame_type = current_frame->frame_type; @@ -3088,7 +3145,7 @@ static int 
encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, cpi->last_frame_type = current_frame->frame_type; if (frame_is_sframe(cm)) { - GF_GROUP *gf_group = &cpi->gf_group; + GF_GROUP *gf_group = &cpi->ppi->gf_group; // S frame will wipe out any previously encoded altref so we cannot place // an overlay frame gf_group->update_type[gf_group->size] = GF_UPDATE; @@ -3110,7 +3167,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, cm->ref_frame_id[i] = display_frame_id; } - cpi->seq_params_locked = 1; + cpi->ppi->seq_params_locked = 1; #if DUMP_RECON_FRAMES == 1 // NOTE(zoeliu): For debug - Output the filtered reconstructed video. @@ -3147,7 +3204,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, if (!is_stat_generation_stage(cpi) && cpi->common.features.allow_screen_content_tools && !frame_is_intra_only(cm)) { - if (cpi->common.seq_params.force_integer_mv == 2) { + if (cpi->common.seq_params->force_integer_mv == 2) { // Adaptive mode: see what previous frame encoded did if (cpi->unscaled_last_source != NULL) { features->cur_frame_force_integer_mv = av1_is_integer_mv( @@ -3157,7 +3214,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, } } else { cpi->common.features.cur_frame_force_integer_mv = - cpi->common.seq_params.force_integer_mv; + cpi->common.seq_params->force_integer_mv; } } else { cpi->common.features.cur_frame_force_integer_mv = 0; @@ -3290,7 +3347,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, cpi->superres_mode = orig_superres_mode; // restore } - cpi->seq_params_locked = 1; + cpi->ppi->seq_params_locked = 1; // Update reference frame ids for reference frames this frame will overwrite if (seq_params->frame_id_numbers_present_flag) { @@ -3332,10 +3389,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, refresh_reference_frames(cpi); -#if CONFIG_ENTROPY_STATS - av1_accumulate_frame_counts(&aggregate_fc, &cpi->counts); -#endif // CONFIG_ENTROPY_STATS - if 
(features->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { *cm->fc = cpi->tile_data[largest_tile_id].tctx; av1_reset_cdf_symbol_counters(cm->fc); @@ -3417,7 +3470,13 @@ int av1_encode(AV1_COMP *const cpi, uint8_t *const dest, current_frame->display_order_hint = current_frame->order_hint; current_frame->order_hint %= - (1 << (cm->seq_params.order_hint_info.order_hint_bits_minus_1 + 1)); + (1 << (cm->seq_params->order_hint_info.order_hint_bits_minus_1 + 1)); + +#if CONFIG_FRAME_PARALLEL_ENCODE + current_frame->pyramid_level = get_true_pyr_level( + cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index], + current_frame->display_order_hint, cpi->ppi->gf_group.max_layer_depth); +#endif // CONFIG_FRAME_PARALLEL_ENCODE if (is_stat_generation_stage(cpi)) { #if !CONFIG_REALTIME_ONLY @@ -3442,9 +3501,9 @@ static int apply_denoise_2d(AV1_COMP *cpi, YV12_BUFFER_CONFIG *sd, AV1_COMMON *const cm = &cpi->common; if (!cpi->denoise_and_model) { cpi->denoise_and_model = aom_denoise_and_model_alloc( - cm->seq_params.bit_depth, block_size, noise_level); + cm->seq_params->bit_depth, block_size, noise_level); if (!cpi->denoise_and_model) { - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Error allocating denoise and model"); return -1; } @@ -3452,7 +3511,7 @@ static int apply_denoise_2d(AV1_COMP *cpi, YV12_BUFFER_CONFIG *sd, if (!cpi->film_grain_table) { cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table)); if (!cpi->film_grain_table) { - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Error allocating grain table"); return -1; } @@ -3474,7 +3533,7 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time) { AV1_COMMON *const cm = &cpi->common; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; int res 
= 0; const int subsampling_x = sd->subsampling_x; const int subsampling_y = sd->subsampling_y; @@ -3516,7 +3575,7 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, res = -1; #if CONFIG_INTERNAL_STATS aom_usec_timer_mark(&timer); - cpi->time_receive_data += aom_usec_timer_elapsed(&timer); + cpi->ppi->total_time_receive_data += aom_usec_timer_elapsed(&timer); #endif // Note: Regarding profile setting, the following checks are added to help @@ -3528,20 +3587,20 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, // header. if ((seq_params->profile == PROFILE_0) && !seq_params->monochrome && (subsampling_x != 1 || subsampling_y != 1)) { - aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, + aom_internal_error(cm->error, AOM_CODEC_INVALID_PARAM, "Non-4:2:0 color format requires profile 1 or 2"); res = -1; } if ((seq_params->profile == PROFILE_1) && !(subsampling_x == 0 && subsampling_y == 0)) { - aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, + aom_internal_error(cm->error, AOM_CODEC_INVALID_PARAM, "Profile 1 requires 4:4:4 color format"); res = -1; } if ((seq_params->profile == PROFILE_2) && (seq_params->bit_depth <= AOM_BITS_10) && !(subsampling_x == 1 && subsampling_y == 0)) { - aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, + aom_internal_error(cm->error, AOM_CODEC_INVALID_PARAM, "Profile 2 bit-depth <= 10 requires 4:2:2 color format"); res = -1; } @@ -3549,6 +3608,20 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, return res; } +#if CONFIG_ENTROPY_STATS +void print_entropy_stats(AV1_PRIMARY *const ppi) { + if (!ppi->cpi) return; + + if (ppi->cpi->oxcf.pass != 1 && + ppi->cpi->common.current_frame.frame_number > 0) { + fprintf(stderr, "Writing counts.stt\n"); + FILE *f = fopen("counts.stt", "wb"); + fwrite(&ppi->aggregate_fc, sizeof(ppi->aggregate_fc), 1, f); + fclose(f); + } +} +#endif // CONFIG_ENTROPY_STATS + #if CONFIG_INTERNAL_STATS extern double 
av1_get_blockiness(const unsigned char *img1, int img1_pitch, const unsigned char *img2, int img2_pitch, @@ -3564,11 +3637,16 @@ static void adjust_image_stat(double y, double u, double v, double all, } static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { + AV1_PRIMARY *const ppi = cpi->ppi; AV1_COMMON *const cm = &cpi->common; double samples = 0.0; const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth; const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; + if (cpi->ppi->use_svc && + cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) + return; + #if CONFIG_INTER_STATS_ONLY if (cm->current_frame.frame_type == KEY_FRAME) return; // skip key frame #endif @@ -3578,9 +3656,9 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf; double y, u, v, frame_all; - cpi->count[0]++; - cpi->count[1]++; - if (cpi->b_calculate_psnr) { + ppi->count[0]++; + ppi->count[1]++; + if (cpi->ppi->b_calculate_psnr) { PSNR_STATS psnr; double weight[2] = { 0.0, 0.0 }; double frame_ssim2[2] = { 0.0, 0.0 }; @@ -3591,34 +3669,30 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { aom_calc_psnr(orig, recon, &psnr); #endif adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3], psnr.psnr[0], - &(cpi->psnr[0])); - cpi->total_sq_error[0] += psnr.sse[0]; - cpi->total_samples[0] += psnr.samples[0]; + &(ppi->psnr[0])); + ppi->total_sq_error[0] += psnr.sse[0]; + ppi->total_samples[0] += psnr.samples[0]; samples = psnr.samples[0]; - // TODO(yaowu): unify these two versions into one. 
- if (cm->seq_params.use_highbitdepth) - aom_highbd_calc_ssim(orig, recon, weight, bit_depth, in_bit_depth, - frame_ssim2); - else - aom_calc_ssim(orig, recon, &weight[0], &frame_ssim2[0]); + aom_calc_ssim(orig, recon, bit_depth, in_bit_depth, + cm->seq_params->use_highbitdepth, weight, frame_ssim2); - cpi->worst_ssim = AOMMIN(cpi->worst_ssim, frame_ssim2[0]); - cpi->summed_quality += frame_ssim2[0] * weight[0]; - cpi->summed_weights += weight[0]; + ppi->worst_ssim = AOMMIN(ppi->worst_ssim, frame_ssim2[0]); + ppi->summed_quality += frame_ssim2[0] * weight[0]; + ppi->summed_weights += weight[0]; #if CONFIG_AV1_HIGHBITDEPTH // Compute PSNR based on stream bit depth if ((cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) && (in_bit_depth < bit_depth)) { adjust_image_stat(psnr.psnr_hbd[1], psnr.psnr_hbd[2], psnr.psnr_hbd[3], - psnr.psnr_hbd[0], &cpi->psnr[1]); - cpi->total_sq_error[1] += psnr.sse_hbd[0]; - cpi->total_samples[1] += psnr.samples_hbd[0]; + psnr.psnr_hbd[0], &ppi->psnr[1]); + ppi->total_sq_error[1] += psnr.sse_hbd[0]; + ppi->total_samples[1] += psnr.samples_hbd[0]; - cpi->worst_ssim_hbd = AOMMIN(cpi->worst_ssim_hbd, frame_ssim2[1]); - cpi->summed_quality_hbd += frame_ssim2[1] * weight[1]; - cpi->summed_weights_hbd += weight[1]; + ppi->worst_ssim_hbd = AOMMIN(ppi->worst_ssim_hbd, frame_ssim2[1]); + ppi->summed_quality_hbd += frame_ssim2[1] * weight[1]; + ppi->summed_weights_hbd += weight[1]; } #endif @@ -3636,48 +3710,207 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { } #endif } - if (cpi->b_calculate_blockiness) { - if (!cm->seq_params.use_highbitdepth) { + if (ppi->b_calculate_blockiness) { + if (!cm->seq_params->use_highbitdepth) { const double frame_blockiness = av1_get_blockiness(orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height); - cpi->worst_blockiness = AOMMAX(cpi->worst_blockiness, frame_blockiness); - cpi->total_blockiness += frame_blockiness; + ppi->worst_blockiness = 
AOMMAX(ppi->worst_blockiness, frame_blockiness); + ppi->total_blockiness += frame_blockiness; } - if (cpi->b_calculate_consistency) { - if (!cm->seq_params.use_highbitdepth) { + if (ppi->b_calculate_consistency) { + if (!cm->seq_params->use_highbitdepth) { const double this_inconsistency = aom_get_ssim_metrics( orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride, - orig->y_width, orig->y_height, cpi->ssim_vars, &cpi->metrics, 1); + orig->y_width, orig->y_height, ppi->ssim_vars, &ppi->metrics, 1); const double peak = (double)((1 << in_bit_depth) - 1); const double consistency = - aom_sse_to_psnr(samples, peak, cpi->total_inconsistency); + aom_sse_to_psnr(samples, peak, ppi->total_inconsistency); if (consistency > 0.0) - cpi->worst_consistency = - AOMMIN(cpi->worst_consistency, consistency); - cpi->total_inconsistency += this_inconsistency; + ppi->worst_consistency = + AOMMIN(ppi->worst_consistency, consistency); + ppi->total_inconsistency += this_inconsistency; } } } frame_all = aom_calc_fastssim(orig, recon, &y, &u, &v, bit_depth, in_bit_depth); - adjust_image_stat(y, u, v, frame_all, &cpi->fastssim); + adjust_image_stat(y, u, v, frame_all, &ppi->fastssim); frame_all = aom_psnrhvs(orig, recon, &y, &u, &v, bit_depth, in_bit_depth); - adjust_image_stat(y, u, v, frame_all, &cpi->psnrhvs); + adjust_image_stat(y, u, v, frame_all, &ppi->psnrhvs); + } +} + +void print_internal_stats(AV1_PRIMARY *const ppi) { + if (!ppi->cpi) return; + AV1_COMP *const cpi = ppi->cpi; + + if (ppi->cpi->oxcf.pass != 1 && + ppi->cpi->common.current_frame.frame_number > 0) { + aom_clear_system_state(); + char headings[512] = { 0 }; + char results[512] = { 0 }; + FILE *f = fopen("opsnr.stt", "a"); + double time_encoded = + (cpi->time_stamps.prev_ts_end - cpi->time_stamps.first_ts_start) / + 10000000.000; + double total_encode_time = + (ppi->total_time_receive_data + ppi->total_time_compress_data) / + 1000.000; + const double dr = + (double)ppi->total_bytes * (double)8 / 
(double)1000 / time_encoded; + const double peak = + (double)((1 << ppi->cpi->oxcf.input_cfg.input_bit_depth) - 1); + const double target_rate = + (double)ppi->cpi->oxcf.rc_cfg.target_bandwidth / 1000; + const double rate_err = ((100.0 * (dr - target_rate)) / target_rate); + + if (ppi->b_calculate_psnr) { + const double total_psnr = aom_sse_to_psnr( + (double)ppi->total_samples[0], peak, (double)ppi->total_sq_error[0]); + const double total_ssim = + 100 * pow(ppi->summed_quality / ppi->summed_weights, 8.0); + snprintf(headings, sizeof(headings), + "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t" + "AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t" + "WstPsnr\tWstSsim\tWstFast\tWstHVS\t" + "AVPsrnY\tAPsnrCb\tAPsnrCr"); + snprintf(results, sizeof(results), + "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\t%7.3f", + dr, ppi->psnr[0].stat[STAT_ALL] / ppi->count[0], total_psnr, + ppi->psnr[0].stat[STAT_ALL] / ppi->count[0], total_psnr, + total_ssim, total_ssim, + ppi->fastssim.stat[STAT_ALL] / ppi->count[0], + ppi->psnrhvs.stat[STAT_ALL] / ppi->count[0], ppi->psnr[0].worst, + ppi->worst_ssim, ppi->fastssim.worst, ppi->psnrhvs.worst, + ppi->psnr[0].stat[STAT_Y] / ppi->count[0], + ppi->psnr[0].stat[STAT_U] / ppi->count[0], + ppi->psnr[0].stat[STAT_V] / ppi->count[0]); + + if (ppi->b_calculate_blockiness) { + SNPRINT(headings, "\t Block\tWstBlck"); + SNPRINT2(results, "\t%7.3f", ppi->total_blockiness / ppi->count[0]); + SNPRINT2(results, "\t%7.3f", ppi->worst_blockiness); + } + + if (ppi->b_calculate_consistency) { + double consistency = + aom_sse_to_psnr((double)ppi->total_samples[0], peak, + (double)ppi->total_inconsistency); + + SNPRINT(headings, "\tConsist\tWstCons"); + SNPRINT2(results, "\t%7.3f", consistency); + SNPRINT2(results, "\t%7.3f", ppi->worst_consistency); + } + + SNPRINT(headings, "\t Time\tRcErr\tAbsErr"); + SNPRINT2(results, "\t%8.0f", total_encode_time); + SNPRINT2(results, " %7.2f", rate_err); 
+ SNPRINT2(results, " %7.2f", fabs(rate_err)); + + SNPRINT(headings, "\tAPsnr611"); + SNPRINT2(results, " %7.3f", + (6 * ppi->psnr[0].stat[STAT_Y] + ppi->psnr[0].stat[STAT_U] + + ppi->psnr[0].stat[STAT_V]) / + (ppi->count[0] * 8)); + +#if CONFIG_AV1_HIGHBITDEPTH + const uint32_t in_bit_depth = ppi->cpi->oxcf.input_cfg.input_bit_depth; + const uint32_t bit_depth = ppi->seq_params.bit_depth; + // Since cpi->source->flags is not available here, but total_samples[1] + // will be non-zero if cpi->source->flags & YV12_FLAG_HIGHBITDEPTH was + // true in compute_internal_stats + if ((ppi->total_samples[1] > 0) && (in_bit_depth < bit_depth)) { + const double peak_hbd = (double)((1 << bit_depth) - 1); + const double total_psnr_hbd = + aom_sse_to_psnr((double)ppi->total_samples[1], peak_hbd, + (double)ppi->total_sq_error[1]); + const double total_ssim_hbd = + 100 * pow(ppi->summed_quality_hbd / ppi->summed_weights_hbd, 8.0); + SNPRINT(headings, + "\t AVGPsnrH GLBPsnrH AVPsnrPH GLPsnrPH" + " AVPsnrYH APsnrCbH APsnrCrH WstPsnrH" + " AOMSSIMH VPSSIMPH WstSsimH"); + SNPRINT2(results, "\t%7.3f", + ppi->psnr[1].stat[STAT_ALL] / ppi->count[1]); + SNPRINT2(results, " %7.3f", total_psnr_hbd); + SNPRINT2(results, " %7.3f", + ppi->psnr[1].stat[STAT_ALL] / ppi->count[1]); + SNPRINT2(results, " %7.3f", total_psnr_hbd); + SNPRINT2(results, " %7.3f", ppi->psnr[1].stat[STAT_Y] / ppi->count[1]); + SNPRINT2(results, " %7.3f", ppi->psnr[1].stat[STAT_U] / ppi->count[1]); + SNPRINT2(results, " %7.3f", ppi->psnr[1].stat[STAT_V] / ppi->count[1]); + SNPRINT2(results, " %7.3f", ppi->psnr[1].worst); + SNPRINT2(results, " %7.3f", total_ssim_hbd); + SNPRINT2(results, " %7.3f", total_ssim_hbd); + SNPRINT2(results, " %7.3f", ppi->worst_ssim_hbd); + } +#endif + fprintf(f, "%s\n", headings); + fprintf(f, "%s\n", results); + } + + fclose(f); + + if (ppi->ssim_vars != NULL) { + aom_free(ppi->ssim_vars); + ppi->ssim_vars = NULL; + } } } #endif // CONFIG_INTERNAL_STATS +void av1_post_encode_updates(AV1_COMP 
*const cpi, size_t size, + int64_t time_stamp, int64_t time_end) { + AV1_PRIMARY *const ppi = cpi->ppi; + AV1_COMMON *const cm = &cpi->common; + // Note *size = 0 indicates a dropped frame for which psnr is not calculated + if (ppi->b_calculate_psnr && size > 0) { + if (cm->show_existing_frame || + (!is_stat_generation_stage(cpi) && cm->show_frame)) { + generate_psnr_packet(cpi); + } + } + + if (ppi->level_params.keep_level_stats && !is_stat_generation_stage(cpi)) { + // Initialize level info. at the beginning of each sequence. + if (cm->current_frame.frame_type == KEY_FRAME && !cpi->no_show_fwd_kf) { + av1_init_level_info(cpi); + } + av1_update_level_info(cpi, size, time_stamp, time_end); + } + +#if CONFIG_INTERNAL_STATS + if (!is_stat_generation_stage(cpi)) { + compute_internal_stats(cpi, (int)size); + } +#endif // CONFIG_INTERNAL_STATS +} + int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, - size_t *size, uint8_t *dest, int64_t *time_stamp, - int64_t *time_end, int flush, + size_t *size, size_t avail_size, uint8_t *dest, + int64_t *time_stamp, int64_t *time_end, int flush, const aom_rational64_t *timestamp_ratio) { const AV1EncoderConfig *const oxcf = &cpi->oxcf; AV1_COMMON *const cm = &cpi->common; +#if CONFIG_INTERNAL_STATS + cpi->frame_recode_hits = 0; + cpi->time_compress_data = 0; + cpi->bytes = 0; +#endif +#if CONFIG_ENTROPY_STATS + if (cpi->compressor_stage == ENCODE_STAGE) { + av1_zero(cpi->counts); + } +#endif + #if CONFIG_BITSTREAM_DEBUG assert(cpi->oxcf.max_threads <= 1 && "bitstream debug tool does not support multithreading"); @@ -3685,12 +3918,13 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, aom_bitstream_queue_set_frame_write(cm->current_frame.order_hint * 2 + cm->show_frame); #endif - if (cpi->use_svc && cm->number_spatial_layers > 1) { + if (cpi->ppi->use_svc && cpi->ppi->number_spatial_layers > 1) { av1_one_pass_cbr_svc_start_layer(cpi); } cm->showable_frame = 0; *size = 0; + cpi->available_bs_size 
= avail_size; #if CONFIG_INTERNAL_STATS struct aom_usec_timer cmptimer; aom_usec_timer_start(&cmptimer); @@ -3763,27 +3997,9 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, aom_usec_timer_mark(&cmptimer); cpi->time_compress_data += aom_usec_timer_elapsed(&cmptimer); #endif // CONFIG_INTERNAL_STATS - // Note *size = 0 indicates a dropped frame for which psnr is not calculated - if (cpi->b_calculate_psnr && *size > 0) { - if (cm->show_existing_frame || - (!is_stat_generation_stage(cpi) && cm->show_frame)) { - generate_psnr_packet(cpi); - } - } - if (cpi->level_params.keep_level_stats && !is_stat_generation_stage(cpi)) { - // Initialize level info. at the beginning of each sequence. - if (cm->current_frame.frame_type == KEY_FRAME && !cpi->no_show_fwd_kf) { - av1_init_level_info(cpi); - } - av1_update_level_info(cpi, *size, *time_stamp, *time_end); - } + av1_post_encode_updates(cpi, *size, *time_stamp, *time_end); -#if CONFIG_INTERNAL_STATS - if (!is_stat_generation_stage(cpi)) { - compute_internal_stats(cpi, (int)(*size)); - } -#endif // CONFIG_INTERNAL_STATS #if CONFIG_SPEED_STATS if (!is_stat_generation_stage(cpi) && !cm->show_existing_frame) { cpi->tx_search_count += cpi->td.mb.txfm_search_info.tx_search_count; @@ -3806,8 +4022,8 @@ int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) { *dest = cm->cur_frame->buf; dest->y_width = cm->width; dest->y_height = cm->height; - dest->uv_width = cm->width >> cm->seq_params.subsampling_x; - dest->uv_height = cm->height >> cm->seq_params.subsampling_y; + dest->uv_width = cm->width >> cm->seq_params->subsampling_x; + dest->uv_height = cm->height >> cm->seq_params->subsampling_y; ret = 0; } else { ret = -1; @@ -3829,12 +4045,12 @@ aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm, YV12_BUFFER_CONFIG *sd) { const int num_planes = av1_num_planes(cm); if (!equal_dimensions_and_border(new_frame, sd)) - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + aom_internal_error(cm->error, 
AOM_CODEC_ERROR, "Incorrect buffer dimensions"); else aom_yv12_copy_frame(new_frame, sd, num_planes); - return cm->error.error_code; + return cm->error->error_code; } int av1_set_internal_size(AV1EncoderConfig *const oxcf, @@ -3919,7 +4135,7 @@ int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *frame_size) { return AOM_CODEC_OK; } -static void svc_set_updates_external_ref_frame_config( +static void svc_set_updates_ref_frame_config( ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags, SVC *const svc) { ext_refresh_frame_flags->update_pending = 1; ext_refresh_frame_flags->last_frame = svc->refresh[svc->ref_idx[0]]; @@ -3980,7 +4196,7 @@ void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) { av1_use_as_reference(&ext_flags->ref_frame_flags, ref); } else { - if (cpi->svc.external_ref_frame_config) { + if (cpi->svc.set_ref_frame_config) { int ref = svc_set_references_external_ref_frame_config(cpi); av1_use_as_reference(&ext_flags->ref_frame_flags, ref); } @@ -4008,9 +4224,8 @@ void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) { ext_refresh_frame_flags->alt2_ref_frame = (upd & AOM_ALT2_FLAG) != 0; ext_refresh_frame_flags->update_pending = 1; } else { - if (cpi->svc.external_ref_frame_config) - svc_set_updates_external_ref_frame_config(ext_refresh_frame_flags, - &cpi->svc); + if (cpi->svc.set_ref_frame_config) + svc_set_updates_ref_frame_config(ext_refresh_frame_flags, &cpi->svc); else ext_refresh_frame_flags->update_pending = 0; } @@ -4030,12 +4245,12 @@ void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) { } } -aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi) { - if (!cpi) return NULL; +aom_fixed_buf_t *av1_get_global_headers(AV1_PRIMARY *ppi) { + if (!ppi) return NULL; uint8_t header_buf[512] = { 0 }; const uint32_t sequence_header_size = - av1_write_sequence_header_obu(&cpi->common.seq_params, &header_buf[0]); + av1_write_sequence_header_obu(&ppi->seq_params, &header_buf[0]); 
assert(sequence_header_size <= sizeof(header_buf)); if (sequence_header_size == 0) return NULL; @@ -4046,7 +4261,8 @@ aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi) { if (payload_offset + sequence_header_size > sizeof(header_buf)) return NULL; memmove(&header_buf[payload_offset], &header_buf[0], sequence_header_size); - if (av1_write_obu_header(&cpi->level_params, OBU_SEQUENCE_HEADER, 0, + if (av1_write_obu_header(&ppi->level_params, &ppi->cpi->frame_header_count, + OBU_SEQUENCE_HEADER, 0, &header_buf[0]) != obu_header_size) { return NULL; } diff --git a/third_party/libaom/source/libaom/av1/encoder/encoder.h b/third_party/libaom/source/libaom/av1/encoder/encoder.h index 905470f437..fe6e76f498 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encoder.h +++ b/third_party/libaom/source/libaom/av1/encoder/encoder.h @@ -35,6 +35,7 @@ #include "av1/encoder/block.h" #include "av1/encoder/context_tree.h" #include "av1/encoder/encodemb.h" +#include "av1/encoder/external_partition.h" #include "av1/encoder/firstpass.h" #include "av1/encoder/global_motion.h" #include "av1/encoder/level.h" @@ -49,6 +50,7 @@ #include "av1/encoder/tokenize.h" #include "av1/encoder/tpl_model.h" #include "av1/encoder/av1_noise_estimate.h" +#include "av1/encoder/bitstream.h" #if CONFIG_INTERNAL_STATS #include "aom_dsp/ssim.h" @@ -119,6 +121,26 @@ enum { FRAMEFLAGS_ERROR_RESILIENT = 1 << 6, } UENUM1BYTE(FRAMETYPE_FLAGS); +#if CONFIG_FRAME_PARALLEL_ENCODE +// 0 level frames are sometimes used for rate control purposes, but for +// reference mapping purposes, the minimum level should be 1. 
+#define MIN_PYR_LEVEL 1 +static INLINE int get_true_pyr_level(int frame_level, int frame_order, + int max_layer_depth) { + if (frame_order == 0) { + // Keyframe case + return MIN_PYR_LEVEL; + } else if (frame_level == MAX_ARF_LAYERS) { + // Leaves + return max_layer_depth; + } else if (frame_level == (MAX_ARF_LAYERS + 1)) { + // Altrefs + return MIN_PYR_LEVEL; + } + return AOMMAX(MIN_PYR_LEVEL, frame_level); +} +#endif // CONFIG_FRAME_PARALLEL_ENCODE + enum { NO_AQ = 0, VARIANCE_AQ = 1, @@ -159,13 +181,6 @@ enum { /*!\cond */ typedef enum { - COST_UPD_SB, - COST_UPD_SBROW, - COST_UPD_TILE, - COST_UPD_OFF, -} COST_UPDATE_TYPE; - -typedef enum { MOD_FP, // First pass MOD_TF, // Temporal filtering MOD_TPL, // TPL @@ -173,12 +188,24 @@ typedef enum { MOD_ENC, // Encode stage MOD_LPF, // Deblocking loop filter MOD_CDEF_SEARCH, // CDEF search + MOD_CDEF, // CDEF frame MOD_LR, // Loop restoration filtering + MOD_PACK_BS, // Pack bitstream NUM_MT_MODULES } MULTI_THREADED_MODULES; /*!\endcond */ +/*!\enum COST_UPDATE_TYPE + * \brief This enum controls how often the entropy costs should be updated. + */ +typedef enum { + COST_UPD_SB, /*!< Update every sb. */ + COST_UPD_SBROW, /*!< Update every sb rows inside a tile. */ + COST_UPD_TILE, /*!< Update every tile. */ + COST_UPD_OFF, /*!< Turn off cost updates. */ +} COST_UPDATE_TYPE; + /*! * \brief Encoder config related to resize. */ @@ -623,6 +650,8 @@ typedef struct { COST_UPDATE_TYPE mode; // Indicates the update frequency for mv costs. COST_UPDATE_TYPE mv; + // Indicates the update frequency for dv costs. + COST_UPDATE_TYPE dv; } CostUpdateFreq; typedef struct { @@ -711,7 +740,10 @@ typedef struct { */ typedef struct { /*! - * Indicates the loop filter sharpness. + * Controls the level at which rate-distortion optimization of transform + * coefficients favours sharpness in the block. Has no impact on RD when set + * to zero (default). 
For values 1-7, eob and skip block optimization are + * avoided and rdmult is adjusted in favour of block sharpness. */ int sharpness; @@ -940,6 +972,10 @@ typedef struct AV1EncoderConfig { // format. bool save_as_annexb; + // The path for partition stats reading and writing, used in the experiment + // CONFIG_PARTITION_SEARCH_ORDER. + const char *partition_info_path; + /*!\endcond */ } AV1EncoderConfig; @@ -1267,6 +1303,7 @@ typedef struct TileDataEnc { TileInfo tile_info; DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx); FRAME_CONTEXT *row_ctx; + uint64_t abs_sum_level; uint8_t allow_update_cdf; InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL]; AV1EncRowMultiThreadSync row_mt_sync; @@ -1295,14 +1332,23 @@ typedef struct ThreadData { PALETTE_BUFFER *palette_buffer; CompoundTypeRdBuffers comp_rd_buffer; CONV_BUF_TYPE *tmp_conv_dst; + uint64_t abs_sum_level; uint8_t *tmp_pred_bufs[2]; int intrabc_used; int deltaq_used; + int coefficient_size; + int max_mv_magnitude; + int interp_filter_selected[SWITCHABLE]; FRAME_CONTEXT *tctx; VP64x64 *vt64x64; int32_t num_64x64_blocks; PICK_MODE_CONTEXT *firstpass_ctx; TemporalFilterData tf_data; + TplTxfmStats tpl_txfm_stats; + // Pointer to the array of structures to store gradient information of each + // pixel in a superblock. The buffer constitutes of MAX_SB_SQUARE pixel level + // structures for each of the plane types (PLANE_TYPE_Y and PLANE_TYPE_UV). + PixelLevelGradientInfo *pixel_gradient_info; } ThreadData; struct EncWorkerData; @@ -1427,6 +1473,11 @@ typedef struct MultiThreadInfo { AV1LrSync lr_row_sync; /*! + * Pack bitstream multi-threading object. + */ + AV1EncPackBSSync pack_bs_sync; + + /*! * Global Motion multi-threading object. */ AV1GlobalMotionSync gm_sync; @@ -1440,6 +1491,11 @@ typedef struct MultiThreadInfo { * CDEF search multi-threading object. */ AV1CdefSync cdef_sync; + + /*! + * CDEF row multi-threading data. 
+ */ + AV1CdefWorkerData *cdef_worker; } MultiThreadInfo; /*!\cond */ @@ -1561,10 +1617,13 @@ enum { rd_pick_sb_modes_time, av1_rd_pick_intra_mode_sb_time, av1_rd_pick_inter_mode_sb_time, + set_params_rd_pick_inter_mode_time, + skip_inter_mode_time, handle_inter_mode_time, evaluate_motion_mode_for_winner_candidates_time, - handle_intra_mode_time, do_tx_search_time, + handle_intra_mode_time, + refine_winner_mode_tx_time, av1_search_palette_mode_time, handle_newmv_time, compound_type_rd_time, @@ -1609,11 +1668,15 @@ static INLINE char const *get_component_name(int index) { return "av1_rd_pick_intra_mode_sb_time"; case av1_rd_pick_inter_mode_sb_time: return "av1_rd_pick_inter_mode_sb_time"; + case set_params_rd_pick_inter_mode_time: + return "set_params_rd_pick_inter_mode_time"; + case skip_inter_mode_time: return "skip_inter_mode_time"; case handle_inter_mode_time: return "handle_inter_mode_time"; case evaluate_motion_mode_for_winner_candidates_time: return "evaluate_motion_mode_for_winner_candidates_time"; - case handle_intra_mode_time: return "handle_intra_mode_time"; case do_tx_search_time: return "do_tx_search_time"; + case handle_intra_mode_time: return "handle_intra_mode_time"; + case refine_winner_mode_tx_time: return "refine_winner_mode_tx_time"; case av1_search_palette_mode_time: return "av1_search_palette_mode_time"; case handle_newmv_time: return "handle_newmv_time"; case compound_type_rd_time: return "compound_type_rd_time"; @@ -2045,12 +2108,88 @@ typedef struct { uint8_t *entropy_ctx; } CoeffBufferPool; +#if CONFIG_FRAME_PARALLEL_ENCODE +/*! + * \brief Max number of frames that can be encoded in a parallel encode set. + */ +#define MAX_PARALLEL_FRAMES 4 + +/*! + * \brief Structure to hold data of frame encoded in a given parallel encode + * set. + */ +typedef struct AV1_FP_OUT_DATA { + /*! + * Buffer to store packed bitstream data of a frame. + */ + unsigned char *cx_data_frame; + + /*! + * Allocated size of the cx_data_frame buffer. 
+ */ + size_t cx_data_sz; + + /*! + * Size of data written in the cx_data_frame buffer. + */ + size_t frame_size; + + /*! + * Display order hint of frame whose packed data is in cx_data_frame buffer. + */ + int frame_display_order_hint; +} AV1_FP_OUT_DATA; +#endif // CONFIG_FRAME_PARALLEL_ENCODE + /*! * \brief Top level primary encoder structure */ typedef struct AV1_PRIMARY { +#if CONFIG_FRAME_PARALLEL_ENCODE + /*! + * Array of frame level encoder stage top level structures + */ + struct AV1_COMP *parallel_cpi[MAX_PARALLEL_FRAMES]; + + /*! + * Number of frame level contexts(cpis) + */ + int num_fp_contexts; + + /*! + * Array of structures to hold data of frames encoded in a given parallel + * encode set. + */ + struct AV1_FP_OUT_DATA parallel_frames_data[MAX_PARALLEL_FRAMES - 1]; + + /*! + * Loopfilter levels of the previous encoded frame. + */ + int filter_level[2]; + int filter_level_u; + int filter_level_v; + + /*! + * Largest MV component used in previous encoded frame during + * stats consumption stage. + */ + int max_mv_magnitude; + + /*! + * Temporary variable simulating the delayed frame_probability update. + */ + FrameProbInfo temp_frame_probs; + + /*! + * Temporary variable used in simulating the delayed update of + * avg_frame_qindex. + */ + int temp_avg_frame_qindex[FRAME_TYPES]; +#endif // CONFIG_FRAME_PARALLEL_ENCODE /*! * Encode stage top level structure + * When CONFIG_FRAME_PARALLEL_ENCODE is enabled this is the same as + * parallel_cpi[0] */ struct AV1_COMP *cpi; @@ -2063,6 +2202,186 @@ typedef struct AV1_PRIMARY { * Look-ahead context. */ struct lookahead_ctx *lookahead; + + /*! + * Sequence parameters have been transmitted already and locked + * or not. Once locked av1_change_config cannot change the seq + * parameters. + */ + int seq_params_locked; + + /*! + * Pointer to internal utility functions that manipulate aom_codec_* data + * structures. + */ + struct aom_codec_pkt_list *output_pkt_list; + + /*! 
+ * When set, indicates that internal ARFs are enabled. + */ + int internal_altref_allowed; + + /*! + * Information related to a gf group. + */ + GF_GROUP gf_group; + + /*! + * Track prior gf group state. + */ + GF_STATE gf_state; + + /*! + * Flag indicating whether look ahead processing (LAP) is enabled. + */ + int lap_enabled; + + /*! + * Parameters for AV1 bitstream levels. + */ + AV1LevelParams level_params; + + /*! + * Calculates PSNR on each frame when set to 1. + */ + int b_calculate_psnr; + + /*! + * Number of frames left to be encoded, is 0 if limit is not set. + */ + int frames_left; + + /*! + * Information related to two pass encoding. + */ + TWO_PASS twopass; + + /*! + * Rate control related parameters. + */ + PRIMARY_RATE_CONTROL p_rc; + + /*! + * Frame buffer holding the temporally filtered source frame. It can be KEY + * frame or ARF frame. + */ + YV12_BUFFER_CONFIG alt_ref_buffer; + + /*! + * Elements part of the sequence header, that are applicable for all the + * frames in the video. + */ + SequenceHeader seq_params; + + /*! + * Indicates whether to use SVC. + */ + int use_svc; + + /*! + * If true, buffer removal times are present. + */ + bool buffer_removal_time_present; + + /*! + * Number of temporal layers: may be > 1 for SVC (scalable vector coding). + */ + unsigned int number_temporal_layers; + + /*! + * Number of spatial layers: may be > 1 for SVC (scalable vector coding). + */ + unsigned int number_spatial_layers; + + /*! + * Code and details about current error status. + */ + struct aom_internal_error_info error; + + /*! + * Function pointers to variants of sse/sad/variance computation functions. + * fn_ptr[i] indicates the list of function pointers corresponding to block + * size i. + */ + aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL]; + + /*! + * Scaling factors used in the RD multiplier modulation. + * TODO(sdeng): consider merge the following arrays. 
+ * tpl_rdmult_scaling_factors is a temporary buffer used to store the + * intermediate scaling factors which are used in the calculation of + * tpl_sb_rdmult_scaling_factors. tpl_rdmult_scaling_factors[i] stores the + * intermediate scaling factor of the ith 16 x 16 block in raster scan order. + */ + double *tpl_rdmult_scaling_factors; + + /*! + * tpl_sb_rdmult_scaling_factors[i] stores the RD multiplier scaling factor of + * the ith 16 x 16 block in raster scan order. + */ + double *tpl_sb_rdmult_scaling_factors; + + /*! + * Parameters related to tpl. + */ + TplParams tpl_data; + + /*! + * Motion vector stats of the previous encoded frame. + */ + MV_STATS mv_stats; + +#if CONFIG_INTERNAL_STATS + /*!\cond */ + uint64_t total_time_receive_data; + uint64_t total_time_compress_data; + + unsigned int total_mode_chosen_counts[MAX_MODES]; + + int count[2]; + uint64_t total_sq_error[2]; + uint64_t total_samples[2]; + ImageStat psnr[2]; + + double total_blockiness; + double worst_blockiness; + + int total_bytes; + double summed_quality; + double summed_weights; + double summed_quality_hbd; + double summed_weights_hbd; + unsigned int total_recode_hits; + double worst_ssim; + double worst_ssim_hbd; + + ImageStat fastssim; + ImageStat psnrhvs; + + int b_calculate_blockiness; + int b_calculate_consistency; + + double total_inconsistency; + double worst_consistency; + Ssimv *ssim_vars; + Metrics metrics; + /*!\endcond */ +#endif + +#if CONFIG_ENTROPY_STATS + /*! + * Aggregates frame counts for the sequence. + */ + FRAME_COUNTS aggregate_fc; +#endif // CONFIG_ENTROPY_STATS + + /*! + * For each type of reference frame, this contains the index of a reference + * frame buffer for a reference frame of the same type. We use this to + * choose our primary reference frame (which is the most recent reference + * frame of the same type as the current frame). + */ + int fb_of_context_type[REF_FRAMES]; } AV1_PRIMARY; /*! 
@@ -2173,9 +2492,9 @@ typedef struct AV1_COMP { YV12_BUFFER_CONFIG *unfiltered_source; /*! - * Parameters related to tpl. + * Skip tpl setup when tpl data from gop length decision can be reused. */ - TplParams tpl_data; + int skip_tpl_setup_stats; /*! * Temporal filter context. @@ -2209,14 +2528,6 @@ typedef struct AV1_COMP { RefreshFrameFlagsInfo refresh_frame; /*! - * For each type of reference frame, this contains the index of a reference - * frame buffer for a reference frame of the same type. We use this to - * choose our primary reference frame (which is the most recent reference - * frame of the same type as the current frame). - */ - int fb_of_context_type[REF_FRAMES]; - - /*! * Flags signalled by the external interface at frame level. */ ExternalFlags ext_flags; @@ -2275,12 +2586,6 @@ typedef struct AV1_COMP { double framerate; /*! - * Pointer to internal utility functions that manipulate aom_codec_* data - * structures. - */ - struct aom_codec_pkt_list *output_pkt_list; - - /*! * Bitmask indicating which reference buffers may be referenced by this frame. */ int ref_frame_flags; @@ -2322,26 +2627,9 @@ typedef struct AV1_COMP { ActiveMap active_map; /*! - * Function pointers to variants of sse/sad/variance computation functions. - * fn_ptr[i] indicates the list of function pointers corresponding to block - * size i. - */ - aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL]; - - /*! - * Information related to two pass encoding. - */ - TWO_PASS twopass; - - /*! - * Information related to a gf group. - */ - GF_GROUP gf_group; - - /*! - * Track prior gf group state. + * The frame processing order within a GOP. */ - GF_STATE gf_state; + unsigned char gf_frame_index; /*! * To control the reference frame buffer and selection. @@ -2349,58 +2637,20 @@ typedef struct AV1_COMP { RefBufferStack ref_buffer_stack; /*! - * Frame buffer holding the temporally filtered source frame. It can be KEY - * frame or ARF frame. - */ - YV12_BUFFER_CONFIG alt_ref_buffer; - - /*! 
* Tell if OVERLAY frame shows existing alt_ref frame. */ int show_existing_alt_ref; #if CONFIG_INTERNAL_STATS /*!\cond */ - uint64_t time_receive_data; uint64_t time_compress_data; unsigned int mode_chosen_counts[MAX_MODES]; - - int count[2]; - uint64_t total_sq_error[2]; - uint64_t total_samples[2]; - ImageStat psnr[2]; - - double total_blockiness; - double worst_blockiness; - int bytes; - double summed_quality; - double summed_weights; - double summed_quality_hbd; - double summed_weights_hbd; - unsigned int tot_recode_hits; - double worst_ssim; - double worst_ssim_hbd; - - ImageStat fastssim; - ImageStat psnrhvs; - - int b_calculate_blockiness; - int b_calculate_consistency; - - double total_inconsistency; - double worst_consistency; - Ssimv *ssim_vars; - Metrics metrics; + unsigned int frame_recode_hits; /*!\endcond */ #endif - /*! - * Calculates PSNR on each frame when set to 1. - */ - int b_calculate_psnr; - #if CONFIG_SPEED_STATS /*! * For debugging: number of transform searches we have performed. @@ -2458,13 +2708,6 @@ typedef struct AV1_COMP { TokenInfo token_info; /*! - * Sequence parameters have been transmitted already and locked - * or not. Once locked av1_change_config cannot change the seq - * parameters. - */ - int seq_params_locked; - - /*! * VARIANCE_AQ segment map refresh. */ int vaq_refresh; @@ -2492,21 +2735,11 @@ typedef struct AV1_COMP { int existing_fb_idx_to_show; /*! - * When set, indicates that internal ARFs are enabled. - */ - int internal_altref_allowed; - - /*! * A flag to indicate if intrabc is ever used in current frame. */ int intrabc_used; /*! - * Tables to calculate IntraBC MV cost. - */ - IntraBCMVCosts dv_costs; - - /*! * Mark which ref frames can be skipped for encoding current frame during RDO. */ int prune_ref_frame_mask; @@ -2571,9 +2804,9 @@ typedef struct AV1_COMP { #endif /*! - * Parameters for AV1 bitstream levels. + * Count the number of OBU_FRAME and OBU_FRAME_HEADER for level calculation. 
*/ - AV1LevelParams level_params; + int frame_header_count; /*! * Whether any no-zero delta_q was actually used. @@ -2586,20 +2819,6 @@ typedef struct AV1_COMP { RefFrameDistanceInfo ref_frame_dist_info; /*! - * Scaling factors used in the RD multiplier modulation. - * TODO(sdeng): consider merge the following arrays. - * tpl_rdmult_scaling_factors is a temporary buffer used to store the - * intermediate scaling factors which are used in the calculation of - * tpl_sb_rdmult_scaling_factors. tpl_rdmult_scaling_factors[i] stores the - * intermediate scaling factor of the ith 16 x 16 block in raster scan order. - */ - double *tpl_rdmult_scaling_factors; - /*! - * tpl_sb_rdmult_scaling_factors[i] stores the RD multiplier scaling factor of - * the ith 16 x 16 block in raster scan order. - */ - double *tpl_sb_rdmult_scaling_factors; - /*! * ssim_rdmult_scaling_factors[i] stores the RD multiplier scaling factor of * the ith 16 x 16 block in raster scan order. This scaling factor is used for * RD multiplier modulation when SSIM tuning is enabled. @@ -2621,30 +2840,16 @@ typedef struct AV1_COMP { #endif /*! - * Indicates whether to use SVC. - */ - int use_svc; - /*! * Parameters for scalable video coding. */ SVC svc; /*! - * Flag indicating whether look ahead processing (LAP) is enabled. - */ - int lap_enabled; - /*! * Indicates whether current processing stage is encode stage or LAP stage. */ COMPRESSOR_STAGE compressor_stage; /*! - * Some motion vector stats from the last encoded frame to help us decide what - * precision to use to encode the current frame. - */ - MV_STATS mv_stats; - - /*! * Frame type of the last frame. May be used in some heuristics for speeding * up the encoding. */ @@ -2686,14 +2891,35 @@ typedef struct AV1_COMP { uint8_t *consec_zero_mv; /*! - * Number of frames left to be encoded, is 0 if limit is not set. + * Block size of first pass encoding */ - int frames_left; + BLOCK_SIZE fp_block_size; /*! 
- * Block size of first pass encoding + * The counter of encoded super block, used to differentiate block names. + * This number starts from 0 and increases whenever a super block is encoded. */ - BLOCK_SIZE fp_block_size; + int sb_counter; + + /*! + * Available bitstream buffer size in bytes + */ + size_t available_bs_size; + + /*! + * The controller of the external partition model. + * It is used to do partition type selection based on external models. + */ + ExtPartController ext_part_controller; + +#if CONFIG_FRAME_PARALLEL_ENCODE + /*! + * A flag to indicate frames that will update their data to the primary + * context at the end of the encode. It is set for non-parallel frames and the + * last frame in encode order in a given parallel encode set. + */ + bool do_frame_data_update; +#endif } AV1_COMP; /*! @@ -2773,26 +2999,39 @@ void av1_initialize_enc(void); struct AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf, BufferPool *const pool, - FIRSTPASS_STATS *frame_stats_buf, COMPRESSOR_STAGE stage, - int num_lap_buffers, - int lap_lag_in_frames, - STATS_BUFFER_CTX *stats_buf_context); + int lap_lag_in_frames); -struct AV1_PRIMARY *av1_create_primary_compressor(); +struct AV1_PRIMARY *av1_create_primary_compressor( + struct aom_codec_pkt_list *pkt_list_head, int num_lap_buffers, + AV1EncoderConfig *oxcf); void av1_remove_compressor(AV1_COMP *cpi); void av1_remove_primary_compressor(AV1_PRIMARY *ppi); -void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf); +#if CONFIG_ENTROPY_STATS +void print_entropy_stats(AV1_PRIMARY *const ppi); +#endif +#if CONFIG_INTERNAL_STATS +void print_internal_stats(AV1_PRIMARY *ppi); +#endif + +void av1_change_config_seq(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf, + bool *sb_size_changed); + +void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf, + bool sb_size_changed); void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth, int subsampling_x, int subsampling_y); -void 
av1_init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm, +void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi, const AV1EncoderConfig *oxcf, int use_svc); +void av1_post_encode_updates(AV1_COMP *const cpi, size_t size, + int64_t time_stamp, int64_t time_end); + /*!\endcond */ /*!\brief Obtain the raw frame data @@ -2827,6 +3066,7 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, * \param[in] cpi Top-level encoder structure * \param[in] frame_flags Flags to decide how to encoding the frame * \param[in] size Bitstream size + * \param[in] avail_size Available bitstream buffer size * \param[in] dest Bitstream output * \param[out] time_stamp Time stamp of the frame * \param[out] time_end Time end @@ -2840,8 +3080,8 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, * \retval #AOM_CODEC_ERROR */ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, - size_t *size, uint8_t *dest, int64_t *time_stamp, - int64_t *time_end, int flush, + size_t *size, size_t avail_size, uint8_t *dest, + int64_t *time_stamp, int64_t *time_end, int flush, const aom_rational64_t *timebase); /*!\brief Run 1-pass/2-pass encoding @@ -2902,6 +3142,71 @@ void av1_set_screen_content_options(struct AV1_COMP *cpi, void av1_update_frame_size(AV1_COMP *cpi); +#if CONFIG_FRAME_PARALLEL_ENCODE +typedef struct { + int pyr_level; + int disp_order; +} RefFrameMapPair; + +static INLINE void init_ref_map_pair( + AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]) { + if (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == KF_UPDATE) { + memset(ref_frame_map_pairs, -1, sizeof(*ref_frame_map_pairs) * REF_FRAMES); + return; + } + memset(ref_frame_map_pairs, 0, sizeof(*ref_frame_map_pairs) * REF_FRAMES); + for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) { + // Get reference frame buffer. 
+ const RefCntBuffer *const buf = cpi->common.ref_frame_map[map_idx]; + if (ref_frame_map_pairs[map_idx].disp_order == -1) continue; + if (buf == NULL) { + ref_frame_map_pairs[map_idx].disp_order = -1; + ref_frame_map_pairs[map_idx].pyr_level = -1; + continue; + } else if (buf->ref_count > 1) { + // Once the keyframe is coded, the slots in ref_frame_map will all + // point to the same frame. In that case, all subsequent pointers + // matching the current are considered "free" slots. This will find + // the next occurance of the current pointer if ref_count indicates + // there are multiple instances of it and mark it as free. + for (int idx2 = map_idx + 1; idx2 < REF_FRAMES; ++idx2) { + const RefCntBuffer *const buf2 = cpi->common.ref_frame_map[idx2]; + if (buf2 == buf) { + ref_frame_map_pairs[idx2].disp_order = -1; + ref_frame_map_pairs[idx2].pyr_level = -1; + } + } + } + ref_frame_map_pairs[map_idx].disp_order = (int)buf->display_order_hint; + ref_frame_map_pairs[map_idx].pyr_level = buf->pyramid_level; + } +} + +static AOM_INLINE void calc_frame_data_update_flag( + GF_GROUP *const gf_group, int gf_frame_index, + bool *const do_frame_data_update) { + *do_frame_data_update = true; + // Set the flag to false for all frames in a given parallel encode set except + // the last frame in the set with frame_parallel_level = 2. + if (gf_group->frame_parallel_level[gf_frame_index] == 1) { + *do_frame_data_update = false; + } else if (gf_group->frame_parallel_level[gf_frame_index] == 2) { + // Check if this is the last frame in the set with frame_parallel_level = 2. 
+ for (int i = gf_frame_index + 1; i < gf_group->size; i++) { + if ((gf_group->frame_parallel_level[i] == 0 && + (gf_group->update_type[i] == ARF_UPDATE || + gf_group->update_type[i] == INTNL_ARF_UPDATE)) || + gf_group->frame_parallel_level[i] == 1) { + break; + } else if (gf_group->frame_parallel_level[i] == 2) { + *do_frame_data_update = false; + break; + } + } + } +} +#endif // CONFIG_FRAME_PARALLEL_ENCODE + // TODO(jingning): Move these functions as primitive members for the new cpi // class. static INLINE void stack_push(int *stack, int *stack_size, int item) { @@ -2949,8 +3254,9 @@ ticks_to_timebase_units(const aom_rational64_t *timestamp_ratio, int64_t n) { } static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) { - const GF_GROUP *const gf_group = &cpi->gf_group; - const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const FRAME_UPDATE_TYPE update_type = + gf_group->update_type[cpi->gf_frame_index]; return frame_is_intra_only(&cpi->common) || update_type == ARF_UPDATE || update_type == GF_UPDATE; @@ -3009,10 +3315,25 @@ static INLINE int is_altref_enabled(int lag_in_frames, bool enable_auto_arf) { return lag_in_frames >= ALT_MIN_LAG && enable_auto_arf; } +static AOM_INLINE int can_disable_altref(const GFConfig *gf_cfg) { + return is_altref_enabled(gf_cfg->lag_in_frames, gf_cfg->enable_auto_arf) && + (gf_cfg->gf_min_pyr_height == 0); +} + +static AOM_INLINE int use_ml_model_to_decide_flat_gop( + const RateControlCfg *rc_cfg) { + return (rc_cfg->mode == AOM_Q && rc_cfg->cq_level <= 200); +} + +// Helper function to compute number of blocks on either side of the frame. 
+static INLINE int get_num_blocks(const int frame_length, const int mb_length) { + return (frame_length + mb_length - 1) / mb_length; +} + // Check if statistics generation stage static INLINE int is_stat_generation_stage(const AV1_COMP *const cpi) { assert(IMPLIES(cpi->compressor_stage == LAP_STAGE, - cpi->oxcf.pass == 0 && cpi->lap_enabled)); + cpi->oxcf.pass == 0 && cpi->ppi->lap_enabled)); return (cpi->oxcf.pass == 1 || (cpi->compressor_stage == LAP_STAGE)); } // Check if statistics consumption stage @@ -3024,7 +3345,7 @@ static INLINE int is_stat_consumption_stage_twopass(const AV1_COMP *const cpi) { static INLINE int is_stat_consumption_stage(const AV1_COMP *const cpi) { return (is_stat_consumption_stage_twopass(cpi) || (cpi->oxcf.pass == 0 && (cpi->compressor_stage == ENCODE_STAGE) && - cpi->lap_enabled)); + cpi->ppi->lap_enabled)); } /*!\endcond */ @@ -3037,11 +3358,18 @@ static INLINE int is_stat_consumption_stage(const AV1_COMP *const cpi) { * \return 0 if no stats for current stage else 1 */ static INLINE int has_no_stats_stage(const AV1_COMP *const cpi) { - assert(IMPLIES(!cpi->lap_enabled, cpi->compressor_stage == ENCODE_STAGE)); - return (cpi->oxcf.pass == 0 && !cpi->lap_enabled); + assert( + IMPLIES(!cpi->ppi->lap_enabled, cpi->compressor_stage == ENCODE_STAGE)); + return (cpi->oxcf.pass == 0 && !cpi->ppi->lap_enabled); } + /*!\cond */ +static INLINE int is_one_pass_rt_params(const AV1_COMP *cpi) { + return has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME && + cpi->oxcf.gf_cfg.lag_in_frames == 0; +} + // Function return size of frame stats buffer static INLINE int get_stats_buf_size(int num_lap_buffer, int num_lag_buffer) { /* if lookahead is enabled return num_lap_buffers else num_lag_buffers */ @@ -3208,7 +3536,7 @@ static INLINE int get_ref_frame_flags(const SPEED_FEATURES *const sf, // Note: The OBU returned is in Low Overhead Bitstream Format. Specifically, // the obu_has_size_field bit is set, and the buffer contains the obu_size // field. 
-aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi); +aom_fixed_buf_t *av1_get_global_headers(AV1_PRIMARY *ppi); #define MAX_GFUBOOST_FACTOR 10.0 #define MIN_GFUBOOST_FACTOR 4.0 @@ -3229,9 +3557,9 @@ static INLINE int is_frame_eligible_for_ref_pruning(const GF_GROUP *gf_group, } // Get update type of the current frame. -static INLINE FRAME_UPDATE_TYPE -get_frame_update_type(const GF_GROUP *gf_group) { - return gf_group->update_type[gf_group->index]; +static INLINE FRAME_UPDATE_TYPE get_frame_update_type(const GF_GROUP *gf_group, + int gf_frame_index) { + return gf_group->update_type[gf_frame_index]; } static INLINE int av1_pixels_to_mi(int pixels) { @@ -3241,14 +3569,15 @@ static INLINE int av1_pixels_to_mi(int pixels) { static AOM_INLINE int is_psnr_calc_enabled(const AV1_COMP *cpi) { const AV1_COMMON *const cm = &cpi->common; - return cpi->b_calculate_psnr && !is_stat_generation_stage(cpi) && + return cpi->ppi->b_calculate_psnr && !is_stat_generation_stage(cpi) && cm->show_frame; } #if CONFIG_AV1_TEMPORAL_DENOISING static INLINE int denoise_svc(const struct AV1_COMP *const cpi) { - return (!cpi->use_svc || (cpi->use_svc && cpi->svc.spatial_layer_id >= - cpi->svc.first_layer_denoise)); + return (!cpi->ppi->use_svc || + (cpi->ppi->use_svc && + cpi->svc.spatial_layer_id >= cpi->svc.first_layer_denoise)); } #endif diff --git a/third_party/libaom/source/libaom/av1/encoder/encoder_alloc.h b/third_party/libaom/source/libaom/av1/encoder/encoder_alloc.h index eae34e0fe6..6eb44e7ee1 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encoder_alloc.h +++ b/third_party/libaom/source/libaom/av1/encoder/encoder_alloc.h @@ -56,7 +56,7 @@ static AOM_INLINE void alloc_compressor_data(AV1_COMP *cpi) { TokenInfo *token_info = &cpi->token_info; if (av1_alloc_context_buffers(cm, cm->width, cm->height)) { - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate context buffers"); } @@ -78,6 +78,13 @@ 
static AOM_INLINE void alloc_compressor_data(AV1_COMP *cpi) { CHECK_MEM_ERROR(cm, cpi->td.mb.mv_costs, (MvCosts *)aom_calloc(1, sizeof(MvCosts))); + if (cpi->td.mb.dv_costs) { + aom_free(cpi->td.mb.dv_costs); + cpi->td.mb.dv_costs = NULL; + } + CHECK_MEM_ERROR(cm, cpi->td.mb.dv_costs, + (IntraBCMVCosts *)aom_malloc(sizeof(*cpi->td.mb.dv_costs))); + av1_setup_shared_coeff_buffer(&cpi->common, &cpi->td.shared_coeff_buf); av1_setup_sms_tree(cpi, &cpi->td); cpi->td.firstpass_ctx = @@ -186,19 +193,10 @@ static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) { aom_free(cpi->ssim_rdmult_scaling_factors); cpi->ssim_rdmult_scaling_factors = NULL; - aom_free(cpi->tpl_rdmult_scaling_factors); - cpi->tpl_rdmult_scaling_factors = NULL; - - aom_free(cpi->tpl_sb_rdmult_scaling_factors); - cpi->tpl_sb_rdmult_scaling_factors = NULL; - #if CONFIG_TUNE_VMAF aom_free(cpi->vmaf_info.rdmult_scaling_factors); cpi->vmaf_info.rdmult_scaling_factors = NULL; - -#if CONFIG_USE_VMAF_RC - aom_close_vmaf_model_rc(cpi->vmaf_info.vmaf_model); -#endif + aom_close_vmaf_model(cpi->vmaf_info.vmaf_model); #endif #if CONFIG_TUNE_BUTTERAUGLI @@ -215,6 +213,11 @@ static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) { cpi->td.mb.mv_costs = NULL; } + if (cpi->td.mb.dv_costs) { + aom_free(cpi->td.mb.dv_costs); + cpi->td.mb.dv_costs = NULL; + } + aom_free(cpi->td.mb.inter_modes_info); cpi->td.mb.inter_modes_info = NULL; @@ -235,7 +238,6 @@ static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) { av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm)); cpi->td.firstpass_ctx = NULL; - av1_free_ref_frame_buffers(cm->buffer_pool); av1_free_txb_buf(cpi); av1_free_context_buffers(cm); @@ -243,10 +245,15 @@ static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) { #if !CONFIG_REALTIME_ONLY av1_free_restoration_buffers(cm); #endif + + if (!is_stat_generation_stage(cpi)) + av1_free_cdef_buffers(cm, &cpi->mt_info.cdef_worker, + &cpi->mt_info.cdef_sync, + 
cpi->mt_info.num_mod_workers[MOD_CDEF]); + aom_free_frame_buffer(&cpi->trial_frame_rst); aom_free_frame_buffer(&cpi->scaled_source); aom_free_frame_buffer(&cpi->scaled_last_source); - aom_free_frame_buffer(&cpi->alt_ref_buffer); free_token_info(token_info); @@ -259,6 +266,7 @@ static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) { for (int j = 0; j < 2; ++j) { aom_free(cpi->td.mb.tmp_pred_bufs[j]); } + aom_free(cpi->td.mb.pixel_gradient_info); #if CONFIG_DENOISE if (cpi->denoise_and_model) { @@ -271,11 +279,7 @@ static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) { cpi->film_grain_table = NULL; } - for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { - aom_free(cpi->level_params.level_info[i]); - } - - if (cpi->use_svc) av1_free_svc_cyclic_refresh(cpi); + if (cpi->ppi->use_svc) av1_free_svc_cyclic_refresh(cpi); if (cpi->consec_zero_mv) { aom_free(cpi->consec_zero_mv); @@ -285,7 +289,7 @@ static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) { static AOM_INLINE void variance_partition_alloc(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; - const int num_64x64_blocks = (cm->seq_params.sb_size == BLOCK_64X64) ? 1 : 4; + const int num_64x64_blocks = (cm->seq_params->sb_size == BLOCK_64X64) ? 1 : 4; if (cpi->td.vt64x64) { if (num_64x64_blocks != cpi->td.num_64x64_blocks) { aom_free(cpi->td.vt64x64); @@ -301,7 +305,7 @@ static AOM_INLINE void variance_partition_alloc(AV1_COMP *cpi) { static AOM_INLINE void alloc_altref_frame_buffer(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; const AV1EncoderConfig *oxcf = &cpi->oxcf; // When lag_in_frames <= 1, alt-ref frames are not enabled. In this case, @@ -311,29 +315,29 @@ static AOM_INLINE void alloc_altref_frame_buffer(AV1_COMP *cpi) { // TODO(agrange) Check if ARF is enabled and skip allocation if not. 
if (aom_realloc_frame_buffer( - &cpi->alt_ref_buffer, oxcf->frm_dim_cfg.width, + &cpi->ppi->alt_ref_buffer, oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, NULL, cpi->oxcf.tool_cfg.enable_global_motion)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate altref buffer"); } static AOM_INLINE void alloc_util_frame_buffers(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; const int byte_alignment = cm->features.byte_alignment; if (aom_realloc_frame_buffer( &cpi->last_frame_uf, cm->width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL, 0)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate last frame buffer"); // The frame buffer trial_frame_rst is used during loop restoration filter // search. Hence it is allocated only when loop restoration is used. 
- const int use_restoration = cm->seq_params.enable_restoration && + const int use_restoration = cm->seq_params->enable_restoration && !cm->features.all_lossless && !cm->tiles.large_scale; if (use_restoration) { @@ -342,7 +346,7 @@ static AOM_INLINE void alloc_util_frame_buffers(AV1_COMP *cpi) { cm->superres_upscaled_height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, AOM_RESTORATION_FRAME_BORDER, byte_alignment, NULL, NULL, NULL, 0)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate trial restored frame buffer"); } @@ -351,7 +355,7 @@ static AOM_INLINE void alloc_util_frame_buffers(AV1_COMP *cpi) { seq_params->subsampling_y, seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL, cpi->oxcf.tool_cfg.enable_global_motion)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate scaled source buffer"); // The frame buffer cpi->scaled_last_source is used to hold the previous @@ -367,7 +371,7 @@ static AOM_INLINE void alloc_util_frame_buffers(AV1_COMP *cpi) { seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL, cpi->oxcf.tool_cfg.enable_global_motion)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate scaled last source buffer"); } } @@ -384,16 +388,16 @@ static AOM_INLINE YV12_BUFFER_CONFIG *realloc_and_scale_source( if (aom_realloc_frame_buffer( &cpi->scaled_source, scaled_width, scaled_height, - cm->seq_params.subsampling_x, cm->seq_params.subsampling_y, - cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, + cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->features.byte_alignment, NULL, NULL, NULL, cpi->oxcf.tool_cfg.enable_global_motion)) - 
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to reallocate scaled source buffer"); assert(cpi->scaled_source.y_crop_width == scaled_width); assert(cpi->scaled_source.y_crop_height == scaled_height); av1_resize_and_extend_frame_nonnormative( - cpi->unscaled_source, &cpi->scaled_source, (int)cm->seq_params.bit_depth, + cpi->unscaled_source, &cpi->scaled_source, (int)cm->seq_params->bit_depth, num_planes); return &cpi->scaled_source; } diff --git a/third_party/libaom/source/libaom/av1/encoder/encoder_utils.c b/third_party/libaom/source/libaom/av1/encoder/encoder_utils.c index 7a7e8505b4..557268f9d3 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encoder_utils.c +++ b/third_party/libaom/source/libaom/av1/encoder/encoder_utils.c @@ -344,7 +344,7 @@ static void configure_static_seg_features(AV1_COMP *cpi) { seg->update_data = 1; qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2); av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2); av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2); @@ -459,13 +459,13 @@ void av1_apply_active_map(AV1_COMP *cpi) { #if !CONFIG_REALTIME_ONLY static void process_tpl_stats_frame(AV1_COMP *cpi) { - const GF_GROUP *const gf_group = &cpi->gf_group; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; AV1_COMMON *const cm = &cpi->common; - assert(IMPLIES(gf_group->size > 0, gf_group->index < gf_group->size)); + assert(IMPLIES(gf_group->size > 0, cpi->gf_frame_index < gf_group->size)); - const int tpl_idx = gf_group->index; - TplParams *const tpl_data = &cpi->tpl_data; + const int tpl_idx = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; @@ -497,22 +497,23 @@ static void process_tpl_stats_frame(AV1_COMP *cpi) { } else { 
aom_clear_system_state(); cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base; - if (is_frame_tpl_eligible(gf_group, gf_group->index)) { - if (cpi->lap_enabled) { - double min_boost_factor = sqrt(cpi->rc.baseline_gf_interval); + if (is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) { + if (cpi->ppi->lap_enabled) { + double min_boost_factor = sqrt(cpi->ppi->p_rc.baseline_gf_interval); const int gfu_boost = get_gfu_boost_from_r0_lap( min_boost_factor, MAX_GFUBOOST_FACTOR, cpi->rd.r0, - cpi->rc.num_stats_required_for_gfu_boost); + cpi->ppi->p_rc.num_stats_required_for_gfu_boost); // printf("old boost %d new boost %d\n", cpi->rc.gfu_boost, // gfu_boost); - cpi->rc.gfu_boost = combine_prior_with_tpl_boost( - min_boost_factor, MAX_BOOST_COMBINE_FACTOR, cpi->rc.gfu_boost, - gfu_boost, cpi->rc.num_stats_used_for_gfu_boost); + cpi->ppi->p_rc.gfu_boost = combine_prior_with_tpl_boost( + min_boost_factor, MAX_BOOST_COMBINE_FACTOR, + cpi->ppi->p_rc.gfu_boost, gfu_boost, + cpi->ppi->p_rc.num_stats_used_for_gfu_boost); } else { const int gfu_boost = (int)(200.0 / cpi->rd.r0); - cpi->rc.gfu_boost = combine_prior_with_tpl_boost( + cpi->ppi->p_rc.gfu_boost = combine_prior_with_tpl_boost( MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR, - cpi->rc.gfu_boost, gfu_boost, cpi->rc.frames_to_key); + cpi->ppi->p_rc.gfu_boost, gfu_boost, cpi->rc.frames_to_key); } } aom_clear_system_state(); @@ -529,17 +530,17 @@ void av1_set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index, av1_set_speed_features_framesize_dependent(cpi, cpi->speed); #if !CONFIG_REALTIME_ONLY - GF_GROUP *gf_group = &cpi->gf_group; + GF_GROUP *gf_group = &cpi->ppi->gf_group; if (cpi->oxcf.algo_cfg.enable_tpl_model && - is_frame_tpl_eligible(gf_group, gf_group->index)) { + is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) { process_tpl_stats_frame(cpi); av1_tpl_rdmult_setup(cpi); } #endif // Decide q and q bounds. 
- *q = av1_rc_pick_q_and_bounds(cpi, &cpi->rc, cm->width, cm->height, - cpi->gf_group.index, bottom_index, top_index); + *q = av1_rc_pick_q_and_bounds(cpi, cm->width, cm->height, cpi->gf_frame_index, + bottom_index, top_index); // Configure experimental use of segmentation for enhanced coding of // static regions if indicated. @@ -564,6 +565,23 @@ static void reset_film_grain_chroma_params(aom_film_grain_t *pars) { memset(pars->ar_coeffs_cb, 0, sizeof(pars->ar_coeffs_cb)); } +void av1_update_film_grain_parameters_seq(struct AV1_PRIMARY *ppi, + const AV1EncoderConfig *oxcf) { + SequenceHeader *const seq_params = &ppi->seq_params; + const TuneCfg *const tune_cfg = &oxcf->tune_cfg; + + if (tune_cfg->film_grain_test_vector || tune_cfg->film_grain_table_filename || + tune_cfg->content == AOM_CONTENT_FILM) { + seq_params->film_grain_params_present = 1; + } else { +#if CONFIG_DENOISE + seq_params->film_grain_params_present = (oxcf->noise_level > 0); +#else + seq_params->film_grain_params_present = 0; +#endif + } +} + void av1_update_film_grain_parameters(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { AV1_COMMON *const cm = &cpi->common; @@ -577,39 +595,30 @@ void av1_update_film_grain_parameters(struct AV1_COMP *cpi, } if (tune_cfg->film_grain_test_vector) { - cm->seq_params.film_grain_params_present = 1; if (cm->current_frame.frame_type == KEY_FRAME) { memcpy(&cm->film_grain_params, film_grain_test_vectors + tune_cfg->film_grain_test_vector - 1, sizeof(cm->film_grain_params)); if (oxcf->tool_cfg.enable_monochrome) reset_film_grain_chroma_params(&cm->film_grain_params); - cm->film_grain_params.bit_depth = cm->seq_params.bit_depth; - if (cm->seq_params.color_range == AOM_CR_FULL_RANGE) { + cm->film_grain_params.bit_depth = cm->seq_params->bit_depth; + if (cm->seq_params->color_range == AOM_CR_FULL_RANGE) { cm->film_grain_params.clip_to_restricted_range = 0; } } } else if (tune_cfg->film_grain_table_filename) { - cm->seq_params.film_grain_params_present = 1; - 
cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table)); memset(cpi->film_grain_table, 0, sizeof(aom_film_grain_table_t)); aom_film_grain_table_read(cpi->film_grain_table, - tune_cfg->film_grain_table_filename, &cm->error); + tune_cfg->film_grain_table_filename, cm->error); } else if (tune_cfg->content == AOM_CONTENT_FILM) { - cm->seq_params.film_grain_params_present = 1; - cm->film_grain_params.bit_depth = cm->seq_params.bit_depth; + cm->film_grain_params.bit_depth = cm->seq_params->bit_depth; if (oxcf->tool_cfg.enable_monochrome) reset_film_grain_chroma_params(&cm->film_grain_params); - if (cm->seq_params.color_range == AOM_CR_FULL_RANGE) + if (cm->seq_params->color_range == AOM_CR_FULL_RANGE) cm->film_grain_params.clip_to_restricted_range = 0; } else { -#if CONFIG_DENOISE - cm->seq_params.film_grain_params_present = (cpi->oxcf.noise_level > 0); -#else - cm->seq_params.film_grain_params_present = 0; -#endif memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params)); } } @@ -643,7 +652,7 @@ void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter, if (aom_yv12_realloc_with_new_border( &ref_fb->buf, AOM_BORDER_IN_PIXELS, cm->features.byte_alignment, num_planes) != 0) { - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } } @@ -652,7 +661,7 @@ void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter, if (new_fb == NULL) { const int new_fb_idx = get_free_fb(cm); if (new_fb_idx == INVALID_IDX) { - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Unable to find free frame buffer"); } force_scaling = 1; @@ -663,30 +672,30 @@ void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter, new_fb->buf.y_crop_height != cm->height) { if (aom_realloc_frame_buffer( &new_fb->buf, cm->width, cm->height, - cm->seq_params.subsampling_x, cm->seq_params.subsampling_y, - 
cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, + cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->features.byte_alignment, NULL, NULL, NULL, 0)) { if (force_scaling) { // Release the reference acquired in the get_free_fb() call above. --new_fb->ref_count; } - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } #if CONFIG_AV1_HIGHBITDEPTH - if (use_optimized_scaler && cm->seq_params.bit_depth == AOM_BITS_8) + if (use_optimized_scaler && cm->seq_params->bit_depth == AOM_BITS_8) av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase, num_planes); else av1_resize_and_extend_frame_nonnormative( - ref, &new_fb->buf, (int)cm->seq_params.bit_depth, num_planes); + ref, &new_fb->buf, (int)cm->seq_params->bit_depth, num_planes); #else if (use_optimized_scaler) av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase, num_planes); else av1_resize_and_extend_frame_nonnormative( - ref, &new_fb->buf, (int)cm->seq_params.bit_depth, num_planes); + ref, &new_fb->buf, (int)cm->seq_params->bit_depth, num_planes); #endif cpi->scaled_ref_buf[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); @@ -704,10 +713,8 @@ void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter, } } -BLOCK_SIZE av1_select_sb_size(const AV1_COMP *const cpi) { - const AV1_COMMON *const cm = &cpi->common; - const AV1EncoderConfig *const oxcf = &cpi->oxcf; - +BLOCK_SIZE av1_select_sb_size(const AV1EncoderConfig *const oxcf, int width, + int height, int number_spatial_layers) { if (oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_64X64) return BLOCK_64X64; if (oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_128X128) @@ -715,7 +722,7 @@ BLOCK_SIZE av1_select_sb_size(const AV1_COMP *const cpi) { assert(oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC); - if (cpi->svc.number_spatial_layers > 1 || + if 
(number_spatial_layers > 1 || oxcf->resize_cfg.resize_mode != RESIZE_NONE) { // Use the configured size (top resolution) for spatial layers or // on resize. @@ -732,7 +739,7 @@ BLOCK_SIZE av1_select_sb_size(const AV1_COMP *const cpi) { // speed-feature. if (oxcf->superres_cfg.superres_mode == AOM_SUPERRES_NONE && oxcf->resize_cfg.resize_mode == RESIZE_NONE && oxcf->speed >= 1) { - return AOMMIN(cm->width, cm->height) > 480 ? BLOCK_128X128 : BLOCK_64X64; + return AOMMIN(width, height) > 480 ? BLOCK_128X128 : BLOCK_64X64; } return BLOCK_128X128; @@ -753,8 +760,10 @@ void av1_setup_frame(AV1_COMP *cpi) { if ((cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) || frame_is_sframe(cm)) { - if (!cpi->seq_params_locked) { - set_sb_size(&cm->seq_params, av1_select_sb_size(cpi)); + if (!cpi->ppi->seq_params_locked) { + set_sb_size(cm->seq_params, + av1_select_sb_size(&cpi->oxcf, cm->width, cm->height, + cpi->svc.number_spatial_layers)); } } else { const RefCntBuffer *const primary_ref_buf = get_primary_ref_frame_buf(cm); @@ -959,7 +968,7 @@ void av1_determine_sc_tools_with_encoding(AV1_COMP *cpi, const int q_orig) { av1_set_speed_features_qindex_dependent(cpi, oxcf->speed); if (q_cfg->deltaq_mode != NO_DELTA_Q || q_cfg->enable_chroma_deltaq) av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); av1_set_variance_partition_thresholds(cpi, q_for_screen_content_quick_run, 0); @@ -1005,13 +1014,13 @@ void av1_finalize_encoded_frame(AV1_COMP *const cpi) { AV1_COMMON *const cm = &cpi->common; CurrentFrame *const current_frame = &cm->current_frame; - if (!cm->seq_params.reduced_still_picture_hdr && + if (!cm->seq_params->reduced_still_picture_hdr && encode_show_existing_frame(cm)) { RefCntBuffer *const frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show]; if (frame_to_show == NULL) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + aom_internal_error(cm->error, 
AOM_CODEC_UNSUP_BITSTREAM, "Buffer does not contain a reconstructed frame"); } assert(frame_to_show->ref_count > 0); @@ -1019,7 +1028,7 @@ void av1_finalize_encoded_frame(AV1_COMP *const cpi) { } if (!encode_show_existing_frame(cm) && - cm->seq_params.film_grain_params_present && + cm->seq_params->film_grain_params_present && (cm->show_frame || cm->showable_frame)) { // Copy the current frame's film grain params to the its corresponding // RefCntBuffer slot. @@ -1232,7 +1241,7 @@ static void save_extra_coding_context(AV1_COMP *cpi) { cc->lf = cm->lf; cc->cdef_info = cm->cdef_info; cc->rc = cpi->rc; - cc->mv_stats = cpi->mv_stats; + cc->mv_stats = cpi->ppi->mv_stats; } void av1_save_all_coding_context(AV1_COMP *cpi) { @@ -1301,11 +1310,11 @@ void av1_dump_filtered_recon_frames(AV1_COMP *cpi) { "show_frame=%d, show_existing_frame=%d, source_alt_ref_active=%d, " "refresh_alt_ref_frame=%d, " "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n\n", - current_frame->frame_number, cpi->gf_group.index, - cpi->gf_group.update_type[cpi->gf_group.index], current_frame->order_hint, - cm->show_frame, cm->show_existing_frame, cpi->rc.source_alt_ref_active, - cpi->refresh_frame.alt_ref_frame, recon_buf->y_stride, - recon_buf->uv_stride, cm->width, cm->height); + current_frame->frame_number, cpi->gf_frame_index, + cpi->ppi->gf_group.update_type[cpi->gf_frame_index], + current_frame->order_hint, cm->show_frame, cm->show_existing_frame, + cpi->rc.source_alt_ref_active, cpi->refresh_frame.alt_ref_frame, + recon_buf->y_stride, recon_buf->uv_stride, cm->width, cm->height); #if 0 int ref_frame; printf("get_ref_frame_map_idx: ["); diff --git a/third_party/libaom/source/libaom/av1/encoder/encoder_utils.h b/third_party/libaom/source/libaom/av1/encoder/encoder_utils.h index 40652e956c..e75bc79ba6 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encoder_utils.h +++ b/third_party/libaom/source/libaom/av1/encoder/encoder_utils.h @@ -125,14 +125,14 @@ static AOM_INLINE void 
init_buffer_indices( } #define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; \ - cpi->fn_ptr[BT].jsdaf = JSDAF; \ - cpi->fn_ptr[BT].jsvaf = JSVAF; + ppi->fn_ptr[BT].sdf = SDF; \ + ppi->fn_ptr[BT].sdaf = SDAF; \ + ppi->fn_ptr[BT].vf = VF; \ + ppi->fn_ptr[BT].svf = SVF; \ + ppi->fn_ptr[BT].svaf = SVAF; \ + ppi->fn_ptr[BT].sdx4df = SDX4DF; \ + ppi->fn_ptr[BT].jsdaf = JSDAF; \ + ppi->fn_ptr[BT].jsvaf = JSVAF; #define HIGHBD_BFP_WRAPPER(WIDTH, HEIGHT, BD) \ HIGHBD_BFP( \ @@ -325,8 +325,8 @@ MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x16_avg) #endif // CONFIG_AV1_HIGHBITDEPTH #define HIGHBD_MBFP(BT, MCSDF, MCSVF) \ - cpi->fn_ptr[BT].msdf = MCSDF; \ - cpi->fn_ptr[BT].msvf = MCSVF; + ppi->fn_ptr[BT].msdf = MCSDF; \ + ppi->fn_ptr[BT].msvf = MCSVF; #define HIGHBD_MBFP_WRAPPER(WIDTH, HEIGHT, BD) \ HIGHBD_MBFP(BLOCK_##WIDTH##X##HEIGHT, \ @@ -386,8 +386,8 @@ MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x16) #endif #define HIGHBD_SDSFP(BT, SDSF, SDSX4DF) \ - cpi->fn_ptr[BT].sdsf = SDSF; \ - cpi->fn_ptr[BT].sdsx4df = SDSX4DF; + ppi->fn_ptr[BT].sdsf = SDSF; \ + ppi->fn_ptr[BT].sdsx4df = SDSX4DF; #define HIGHBD_SDSFP_WRAPPER(WIDTH, HEIGHT, BD) \ HIGHBD_SDSFP(BLOCK_##WIDTH##X##HEIGHT, \ @@ -487,9 +487,9 @@ MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x32x4d) aom_highbd_obmc_sub_pixel_variance##WIDTH##x##HEIGHT) #define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \ - cpi->fn_ptr[BT].osdf = OSDF; \ - cpi->fn_ptr[BT].ovf = OVF; \ - cpi->fn_ptr[BT].osvf = OSVF; + ppi->fn_ptr[BT].osdf = OSDF; \ + ppi->fn_ptr[BT].ovf = OVF; \ + ppi->fn_ptr[BT].osvf = OSVF; #define HIGHBD_OBFP_WRAPPER(WIDTH, HEIGHT, BD) \ HIGHBD_OBFP(BLOCK_##WIDTH##X##HEIGHT, \ @@ -542,10 +542,10 @@ MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x64) MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16) #endif 
-static AOM_INLINE void highbd_set_var_fns(AV1_COMP *const cpi) { - AV1_COMMON *const cm = &cpi->common; - if (cm->seq_params.use_highbitdepth) { - switch (cm->seq_params.bit_depth) { +static AOM_INLINE void highbd_set_var_fns(AV1_PRIMARY *const ppi) { + SequenceHeader *const seq_params = &ppi->seq_params; + if (seq_params->use_highbitdepth) { + switch (seq_params->bit_depth) { case AOM_BITS_8: #if !CONFIG_REALTIME_ONLY HIGHBD_BFP_WRAPPER(64, 16, 8) @@ -850,7 +850,7 @@ static AOM_INLINE void highbd_set_var_fns(AV1_COMP *const cpi) { default: assert(0 && - "cm->seq_params.bit_depth should be AOM_BITS_8, " + "cm->seq_params->bit_depth should be AOM_BITS_8, " "AOM_BITS_10 or AOM_BITS_12"); } } @@ -873,6 +873,33 @@ static AOM_INLINE void copy_frame_prob_info(AV1_COMP *cpi) { av1_copy(frame_probs->switchable_interp_probs, default_switchable_interp_probs); } + +#if CONFIG_FRAME_PARALLEL_ENCODE + FrameProbInfo *const temp_frame_probs = &cpi->ppi->temp_frame_probs; + if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { + av1_copy(temp_frame_probs->tx_type_probs, default_tx_type_probs); + } + if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 && + cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) { + av1_copy(temp_frame_probs->obmc_probs, default_obmc_probs); + } + if (cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { + av1_copy(temp_frame_probs->warped_probs, default_warped_probs); + } + if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) { + av1_copy(temp_frame_probs->switchable_interp_probs, + default_switchable_interp_probs); + } +#endif +} + +static AOM_INLINE void restore_cdef_coding_context(CdefInfo *const dst, + const CdefInfo *const src) { + dst->cdef_bits = src->cdef_bits; + dst->cdef_damping = src->cdef_damping; + av1_copy(dst->cdef_strengths, src->cdef_strengths); + av1_copy(dst->cdef_uv_strengths, src->cdef_uv_strengths); + dst->nb_cdef_strengths = src->nb_cdef_strengths; } // Coding context that only needs to be restored when recode loop includes 
@@ -882,9 +909,9 @@ static AOM_INLINE void restore_extra_coding_context(AV1_COMP *cpi) { CODING_CONTEXT *const cc = &cpi->coding_context; AV1_COMMON *cm = &cpi->common; cm->lf = cc->lf; - cm->cdef_info = cc->cdef_info; + restore_cdef_coding_context(&cm->cdef_info, &cc->cdef_info); cpi->rc = cc->rc; - cpi->mv_stats = cc->mv_stats; + cpi->ppi->mv_stats = cc->mv_stats; } static AOM_INLINE int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a, @@ -964,6 +991,8 @@ static AOM_INLINE void refresh_reference_frames(AV1_COMP *cpi) { } } +void av1_update_film_grain_parameters_seq(struct AV1_PRIMARY *ppi, + const AV1EncoderConfig *oxcf); void av1_update_film_grain_parameters(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf); @@ -972,7 +1001,8 @@ void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter, void av1_setup_frame(AV1_COMP *cpi); -BLOCK_SIZE av1_select_sb_size(const AV1_COMP *const cpi); +BLOCK_SIZE av1_select_sb_size(const AV1EncoderConfig *const oxcf, int width, + int height, int number_spatial_layers); void av1_apply_active_map(AV1_COMP *cpi); diff --git a/third_party/libaom/source/libaom/av1/encoder/encodetxb.c b/third_party/libaom/source/libaom/av1/encoder/encodetxb.c index 7b0b281c80..0eb134890e 100644 --- a/third_party/libaom/source/libaom/av1/encoder/encodetxb.c +++ b/third_party/libaom/source/libaom/av1/encoder/encodetxb.c @@ -26,11 +26,11 @@ void av1_alloc_txb_buf(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; CoeffBufferPool *coeff_buf_pool = &cpi->coeff_buffer_pool; - int size = ((cm->mi_params.mi_rows >> cm->seq_params.mib_size_log2) + 1) * - ((cm->mi_params.mi_cols >> cm->seq_params.mib_size_log2) + 1); + int size = ((cm->mi_params.mi_rows >> cm->seq_params->mib_size_log2) + 1) * + ((cm->mi_params.mi_cols >> cm->seq_params->mib_size_log2) + 1); const int num_planes = av1_num_planes(cm); - const int subsampling_x = cm->seq_params.subsampling_x; - const int subsampling_y = cm->seq_params.subsampling_y; + const int subsampling_x = 
cm->seq_params->subsampling_x; + const int subsampling_y = cm->seq_params->subsampling_y; const int chroma_max_sb_square = MAX_SB_SQUARE >> (subsampling_x + subsampling_y); const int num_tcoeffs = @@ -624,6 +624,7 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row, const int coeff_ctx = coeff_contexts[pos]; const tran_low_t v = qcoeff[pos]; const tran_low_t level = abs(v); + td->abs_sum_level += level; if (allow_update_cdf) { if (c == eob - 1) { @@ -719,7 +720,7 @@ void av1_update_intra_mb_txb_context(const AV1_COMP *cpi, ThreadData *td, CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row, int mi_col) { const AV1_COMMON *const cm = &cpi->common; - const int mib_size_log2 = cm->seq_params.mib_size_log2; + const int mib_size_log2 = cm->seq_params->mib_size_log2; const int stride = (cm->mi_params.mi_cols >> mib_size_log2) + 1; const int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2); diff --git a/third_party/libaom/source/libaom/av1/encoder/ethread.c b/third_party/libaom/source/libaom/av1/encoder/ethread.c index 3735ca3c8b..d274b6b84f 100644 --- a/third_party/libaom/source/libaom/av1/encoder/ethread.c +++ b/third_party/libaom/source/libaom/av1/encoder/ethread.c @@ -11,9 +11,11 @@ #include "av1/common/warped_motion.h" +#include "av1/encoder/bitstream.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encoder_alloc.h" +#include "av1/encoder/encodeframe_utils.h" #include "av1/encoder/ethread.h" #if !CONFIG_REALTIME_ONLY #include "av1/encoder/firstpass.h" @@ -52,7 +54,7 @@ static AOM_INLINE void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { static AOM_INLINE void update_delta_lf_for_row_mt(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &cpi->td.mb.e_mbd; - const int mib_size = cm->seq_params.mib_size; + const int mib_size = cm->seq_params->mib_size; const int frame_lf_count = av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2; for (int row = 0; row < cm->tiles.rows; row++) { @@ -68,7 +70,8 @@ static AOM_INLINE void update_delta_lf_for_row_mt(AV1_COMP *cpi) { const int idx_str = cm->mi_params.mi_stride * mi_row + mi_col; MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + idx_str; MB_MODE_INFO *mbmi = mi[0]; - if (mbmi->skip_txfm == 1 && (mbmi->bsize == cm->seq_params.sb_size)) { + if (mbmi->skip_txfm == 1 && + (mbmi->bsize == cm->seq_params->sb_size)) { for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id]; mbmi->delta_lf_from_base = xd->delta_lf_from_base; @@ -362,7 +365,7 @@ static AOM_INLINE void switch_tile_and_get_next_job( *cur_tile_id = tile_id; const int unit_height = mi_size_high[fp_block_size]; get_next_job(&tile_data[tile_id], current_mi_row, - is_firstpass ? unit_height : cm->seq_params.mib_size); + is_firstpass ? unit_height : cm->seq_params->mib_size); } } @@ -441,13 +444,20 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) { const BLOCK_SIZE fp_block_size = cpi->fp_block_size; int end_of_frame = 0; + + // When master thread does not have a valid job to process, xd->tile_ctx + // is not set and it contains NULL pointer. This can result in NULL pointer + // access violation if accessed beyond the encode stage. Hence, updating + // thread_data->td->mb.e_mbd.tile_ctx is initialized with common frame + // context to avoid NULL pointer access in subsequent stages. + thread_data->td->mb.e_mbd.tile_ctx = cm->fc; while (1) { int current_mi_row = -1; #if CONFIG_MULTITHREAD pthread_mutex_lock(enc_row_mt_mutex_); #endif if (!get_next_job(&cpi->tile_data[cur_tile_id], ¤t_mi_row, - cm->seq_params.mib_size)) { + cm->seq_params->mib_size)) { // No jobs are available for the current tile. 
Query for the status of // other tiles and get the next job if available switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id, @@ -470,6 +480,7 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) { td->mb.e_mbd.tile_ctx = td->tctx; td->mb.tile_pb_ctx = &this_tile->tctx; + td->abs_sum_level = 0; if (this_tile->allow_update_cdf) { td->mb.row_ctx = this_tile->row_ctx; @@ -482,7 +493,7 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) { av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row, &td->mb.e_mbd); - cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params); + cfl_init(&td->mb.e_mbd.cfl, cm->seq_params); if (td->mb.txfm_search_info.txb_rd_records != NULL) { av1_crc32c_calculator_init( &td->mb.txfm_search_info.txb_rd_records->mb_rd_record.crc_calculator); @@ -492,6 +503,7 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) { #if CONFIG_MULTITHREAD pthread_mutex_lock(enc_row_mt_mutex_); #endif + this_tile->abs_sum_level += td->abs_sum_level; row_mt_sync->num_threads_working--; #if CONFIG_MULTITHREAD pthread_mutex_unlock(enc_row_mt_mutex_); @@ -526,16 +538,12 @@ static int enc_worker_hook(void *arg1, void *unused) { return 1; } -void av1_create_second_pass_workers(AV1_COMP *cpi, int num_workers) { +#if CONFIG_MULTITHREAD +void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass) { AV1_COMMON *const cm = &cpi->common; - const AVxWorkerInterface *const winterface = aom_get_worker_interface(); MultiThreadInfo *const mt_info = &cpi->mt_info; - assert(mt_info->workers != NULL); - assert(mt_info->tile_thr_data != NULL); - -#if CONFIG_MULTITHREAD - if (cpi->oxcf.row_mt == 1) { + if (is_first_pass || cpi->oxcf.row_mt == 1) { AV1EncRowMultiThreadInfo *enc_row_mt = &mt_info->enc_row_mt; if (enc_row_mt->mutex_ == NULL) { CHECK_MEM_ERROR(cm, enc_row_mt->mutex_, @@ -543,24 +551,39 @@ void av1_create_second_pass_workers(AV1_COMP *cpi, int num_workers) { if (enc_row_mt->mutex_) pthread_mutex_init(enc_row_mt->mutex_, NULL); } } - 
AV1GlobalMotionSync *gm_sync = &mt_info->gm_sync; - if (gm_sync->mutex_ == NULL) { - CHECK_MEM_ERROR(cm, gm_sync->mutex_, - aom_malloc(sizeof(*(gm_sync->mutex_)))); - if (gm_sync->mutex_) pthread_mutex_init(gm_sync->mutex_, NULL); - } - AV1TemporalFilterSync *tf_sync = &mt_info->tf_sync; - if (tf_sync->mutex_ == NULL) { - CHECK_MEM_ERROR(cm, tf_sync->mutex_, aom_malloc(sizeof(*tf_sync->mutex_))); - if (tf_sync->mutex_) pthread_mutex_init(tf_sync->mutex_, NULL); - } - AV1CdefSync *cdef_sync = &mt_info->cdef_sync; - if (cdef_sync->mutex_ == NULL) { - CHECK_MEM_ERROR(cm, cdef_sync->mutex_, - aom_malloc(sizeof(*(cdef_sync->mutex_)))); - if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL); + + if (!is_first_pass) { + AV1GlobalMotionSync *gm_sync = &mt_info->gm_sync; + if (gm_sync->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, gm_sync->mutex_, + aom_malloc(sizeof(*(gm_sync->mutex_)))); + if (gm_sync->mutex_) pthread_mutex_init(gm_sync->mutex_, NULL); + } +#if !CONFIG_REALTIME_ONLY + AV1TemporalFilterSync *tf_sync = &mt_info->tf_sync; + if (tf_sync->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, tf_sync->mutex_, + aom_malloc(sizeof(*tf_sync->mutex_))); + if (tf_sync->mutex_) pthread_mutex_init(tf_sync->mutex_, NULL); + } +#endif // !CONFIG_REALTIME_ONLY + AV1CdefSync *cdef_sync = &mt_info->cdef_sync; + if (cdef_sync->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, cdef_sync->mutex_, + aom_malloc(sizeof(*(cdef_sync->mutex_)))); + if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL); + } } -#endif +} +#endif // CONFIG_MULTITHREAD + +void av1_create_second_pass_workers(AV1_COMP *cpi, int num_workers) { + AV1_COMMON *const cm = &cpi->common; + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + MultiThreadInfo *const mt_info = &cpi->mt_info; + + assert(mt_info->workers != NULL); + assert(mt_info->tile_thr_data != NULL); for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *const worker = &mt_info->workers[i]; @@ -576,7 +599,7 @@ void 
av1_create_second_pass_workers(AV1_COMP *cpi, int num_workers) { // Create threads if (!winterface->reset(worker)) - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Tile encoder thread creation failed"); } else { // Main thread acts as a worker and uses the thread data in cpi. @@ -625,10 +648,6 @@ static AOM_INLINE void create_enc_workers(AV1_COMP *cpi, int num_workers) { alloc_compound_type_rd_buffers(cm, &thread_data->td->comp_rd_buffer); - CHECK_MEM_ERROR( - cm, thread_data->td->tmp_conv_dst, - aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * - sizeof(*thread_data->td->tmp_conv_dst))); for (int j = 0; j < 2; ++j) { CHECK_MEM_ERROR( cm, thread_data->td->tmp_pred_bufs[j], @@ -636,9 +655,14 @@ static AOM_INLINE void create_enc_workers(AV1_COMP *cpi, int num_workers) { sizeof(*thread_data->td->tmp_pred_bufs[j]))); } + const int plane_types = PLANE_TYPES >> cm->seq_params->monochrome; + CHECK_MEM_ERROR(cm, thread_data->td->pixel_gradient_info, + aom_malloc(sizeof(*thread_data->td->pixel_gradient_info) * + plane_types * MAX_SB_SQUARE)); + if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) { const int num_64x64_blocks = - (cm->seq_params.sb_size == BLOCK_64X64) ? 1 : 4; + (cm->seq_params->sb_size == BLOCK_64X64) ? 1 : 4; CHECK_MEM_ERROR( cm, thread_data->td->vt64x64, aom_malloc(sizeof(*thread_data->td->vt64x64) * num_64x64_blocks)); @@ -680,6 +704,10 @@ void av1_create_workers(AV1_COMP *cpi, int num_workers) { // Set up shared coeff buffers. 
av1_setup_shared_coeff_buffer(cm, &thread_data->td->shared_coeff_buf); + CHECK_MEM_ERROR( + cm, thread_data->td->tmp_conv_dst, + aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * + sizeof(*thread_data->td->tmp_conv_dst))); } ++mt_info->num_workers; } @@ -724,7 +752,7 @@ static AOM_INLINE void fp_create_enc_workers(AV1_COMP *cpi, int num_workers) { if (create_workers) { // Create threads if (!winterface->reset(worker)) - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Tile encoder thread creation failed"); } } else { @@ -764,7 +792,7 @@ static AOM_INLINE void sync_enc_workers(MultiThreadInfo *const mt_info, } if (had_error) - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Failed to encode tile data"); } @@ -780,14 +808,15 @@ static AOM_INLINE void accumulate_counters_enc_workers(AV1_COMP *cpi, !frame_is_intra_only(&cpi->common)) av1_accumulate_cyclic_refresh_counters(cpi->cyclic_refresh, &thread_data->td->mb); - if (thread_data->td->mb.txfm_search_info.txb_rd_records) { - aom_free(thread_data->td->mb.txfm_search_info.txb_rd_records); - thread_data->td->mb.txfm_search_info.txb_rd_records = NULL; - } - if (thread_data->td != &cpi->td && - cpi->oxcf.cost_upd_freq.mv < COST_UPD_OFF) { - aom_free(thread_data->td->mb.mv_costs); + if (thread_data->td != &cpi->td) { + if (cpi->oxcf.cost_upd_freq.mv < COST_UPD_OFF) { + aom_free(thread_data->td->mb.mv_costs); + } + if (cpi->oxcf.cost_upd_freq.dv < COST_UPD_OFF) { + aom_free(thread_data->td->mb.dv_costs); + } } + av1_dealloc_mb_data(&cpi->common, &thread_data->td->mb); // Accumulate counters. if (i > 0) { @@ -822,6 +851,7 @@ static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, thread_data->td->intrabc_used = 0; thread_data->td->deltaq_used = 0; + thread_data->td->abs_sum_level = 0; // Before encoding a frame, copy the thread data from cpi. 
if (thread_data->td != &cpi->td) { @@ -846,15 +876,19 @@ static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, memcpy(thread_data->td->mb.mv_costs, cpi->td.mb.mv_costs, sizeof(MvCosts)); } + if (cpi->oxcf.cost_upd_freq.dv < COST_UPD_OFF) { + CHECK_MEM_ERROR(cm, thread_data->td->mb.dv_costs, + (IntraBCMVCosts *)aom_malloc(sizeof(IntraBCMVCosts))); + memcpy(thread_data->td->mb.dv_costs, cpi->td.mb.dv_costs, + sizeof(IntraBCMVCosts)); + } } + av1_alloc_mb_data(cm, &thread_data->td->mb, + cpi->sf.rt_sf.use_nonrd_pick_mode); + // Reset cyclic refresh counters. av1_init_cyclic_refresh_counters(&thread_data->td->mb); - if (!cpi->sf.rt_sf.use_nonrd_pick_mode) { - CHECK_MEM_ERROR(cm, thread_data->td->mb.txfm_search_info.txb_rd_records, - (TxbRdRecords *)aom_malloc(sizeof(TxbRdRecords))); - } - if (thread_data->td->counts != &cpi->counts) { memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts)); } @@ -867,6 +901,8 @@ static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, thread_data->td->mb.tmp_pred_bufs[j] = thread_data->td->tmp_pred_bufs[j]; } + thread_data->td->mb.pixel_gradient_info = + thread_data->td->pixel_gradient_info; thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst; for (int j = 0; j < 2; ++j) { @@ -904,11 +940,16 @@ static AOM_INLINE void fp_prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, memcpy(thread_data->td->mb.mv_costs, cpi->td.mb.mv_costs, sizeof(MvCosts)); } + if (cpi->oxcf.cost_upd_freq.dv < COST_UPD_OFF) { + CHECK_MEM_ERROR(cm, thread_data->td->mb.dv_costs, + (IntraBCMVCosts *)aom_malloc(sizeof(IntraBCMVCosts))); + memcpy(thread_data->td->mb.dv_costs, cpi->td.mb.dv_costs, + sizeof(IntraBCMVCosts)); + } } - if (!cpi->sf.rt_sf.use_nonrd_pick_mode) { - CHECK_MEM_ERROR(cm, thread_data->td->mb.txfm_search_info.txb_rd_records, - (TxbRdRecords *)aom_malloc(sizeof(TxbRdRecords))); - } + + av1_alloc_mb_data(cm, &thread_data->td->mb, + cpi->sf.rt_sf.use_nonrd_pick_mode); } } 
#endif @@ -1191,13 +1232,15 @@ void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi) { sync_enc_workers(&cpi->mt_info, cm, num_workers); for (int i = num_workers - 1; i >= 0; i--) { EncWorkerData *const thread_data = &cpi->mt_info.tile_thr_data[i]; - if (thread_data->td != &cpi->td && - cpi->oxcf.cost_upd_freq.mv < COST_UPD_OFF) { - aom_free(thread_data->td->mb.mv_costs); - } - if (thread_data->td->mb.txfm_search_info.txb_rd_records) { - aom_free(thread_data->td->mb.txfm_search_info.txb_rd_records); + if (thread_data->td != &cpi->td) { + if (cpi->oxcf.cost_upd_freq.mv < COST_UPD_OFF) { + aom_free(thread_data->td->mb.mv_costs); + } + if (cpi->oxcf.cost_upd_freq.dv < COST_UPD_OFF) { + aom_free(thread_data->td->mb.dv_costs); + } } + av1_dealloc_mb_data(cm, &thread_data->td->mb); } } @@ -1277,11 +1320,15 @@ static int tpl_worker_hook(void *arg1, void *unused) { AV1_COMMON *cm = &cpi->common; MACROBLOCK *x = &thread_data->td->mb; MACROBLOCKD *xd = &x->e_mbd; + TplTxfmStats *tpl_txfm_stats = &thread_data->td->tpl_txfm_stats; CommonModeInfoParams *mi_params = &cm->mi_params; - BLOCK_SIZE bsize = convert_length_to_bsize(cpi->tpl_data.tpl_bsize_1d); + BLOCK_SIZE bsize = convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d); TX_SIZE tx_size = max_txsize_lookup[bsize]; int mi_height = mi_size_high[bsize]; - int num_active_workers = cpi->tpl_data.tpl_mt_sync.num_threads_working; + int num_active_workers = cpi->ppi->tpl_data.tpl_mt_sync.num_threads_working; + + av1_init_tpl_txfm_stats(tpl_txfm_stats); + for (int mi_row = thread_data->start * mi_height; mi_row < mi_params->mi_rows; mi_row += num_active_workers * mi_height) { // Motion estimation row boundary @@ -1290,7 +1337,7 @@ static int tpl_worker_hook(void *arg1, void *unused) { xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); xd->mb_to_bottom_edge = GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE); - av1_mc_flow_dispenser_row(cpi, x, mi_row, bsize, tx_size); + av1_mc_flow_dispenser_row(cpi, 
tpl_txfm_stats, x, mi_row, bsize, tx_size); } return 1; } @@ -1370,6 +1417,24 @@ static AOM_INLINE void prepare_tpl_workers(AV1_COMP *cpi, AVxWorkerHook hook, // OBMC buffers are used only to init MS params and remain unused when // called from tpl, hence set the buffers to defaults. av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer); + thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst; + thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst; + } + } +} + +// Accumulate transform stats after tpl. +static void tpl_accumulate_txfm_stats(ThreadData *main_td, + const MultiThreadInfo *mt_info, + int num_workers) { + TplTxfmStats *accumulated_stats = &main_td->tpl_txfm_stats; + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &mt_info->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + ThreadData *td = thread_data->td; + if (td != main_td) { + const TplTxfmStats *tpl_txfm_stats = &td->tpl_txfm_stats; + av1_accumulate_tpl_txfm_stats(tpl_txfm_stats, accumulated_stats); } } } @@ -1379,7 +1444,7 @@ void av1_mc_flow_dispenser_mt(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; CommonModeInfoParams *mi_params = &cm->mi_params; MultiThreadInfo *mt_info = &cpi->mt_info; - TplParams *tpl_data = &cpi->tpl_data; + TplParams *tpl_data = &cpi->ppi->tpl_data; AV1TplRowMultiThreadSync *tpl_sync = &tpl_data->tpl_mt_sync; int mb_rows = mi_params->mb_rows; int num_workers = @@ -1398,6 +1463,7 @@ void av1_mc_flow_dispenser_mt(AV1_COMP *cpi) { prepare_tpl_workers(cpi, tpl_worker_hook, num_workers); launch_workers(&cpi->mt_info, num_workers); sync_enc_workers(&cpi->mt_info, cm, num_workers); + tpl_accumulate_txfm_stats(&cpi->td, &cpi->mt_info, num_workers); } // Deallocate memory for temporal filter multi-thread synchronization. 
@@ -1752,6 +1818,331 @@ void av1_global_motion_estimation_mt(AV1_COMP *cpi) { } #endif // !CONFIG_REALTIME_ONLY +// Compare and order tiles based on absolute sum of tx coeffs. +static int compare_tile_order(const void *a, const void *b) { + const PackBSTileOrder *const tile_a = (const PackBSTileOrder *)a; + const PackBSTileOrder *const tile_b = (const PackBSTileOrder *)b; + + if (tile_a->abs_sum_level > tile_b->abs_sum_level) + return -1; + else if (tile_a->abs_sum_level == tile_b->abs_sum_level) + return (tile_a->tile_idx > tile_b->tile_idx ? 1 : -1); + else + return 1; +} + +// Get next tile index to be processed for pack bitstream +static AOM_INLINE int get_next_pack_bs_tile_idx( + AV1EncPackBSSync *const pack_bs_sync, const int num_tiles) { + assert(pack_bs_sync->next_job_idx <= num_tiles); + if (pack_bs_sync->next_job_idx == num_tiles) return -1; + + return pack_bs_sync->pack_bs_tile_order[pack_bs_sync->next_job_idx++] + .tile_idx; +} + +// Calculates bitstream chunk size based on total buffer size and tile or tile +// group size. +static AOM_INLINE size_t get_bs_chunk_size(int tg_or_tile_size, + const int frame_or_tg_size, + size_t *remain_buf_size, + size_t max_buf_size, + int is_last_chunk) { + size_t this_chunk_size; + assert(*remain_buf_size > 0); + if (is_last_chunk) { + this_chunk_size = *remain_buf_size; + *remain_buf_size = 0; + } else { + const uint64_t size_scale = (uint64_t)max_buf_size * tg_or_tile_size; + this_chunk_size = (size_t)(size_scale / frame_or_tg_size); + *remain_buf_size -= this_chunk_size; + assert(*remain_buf_size > 0); + } + assert(this_chunk_size > 0); + return this_chunk_size; +} + +// Initializes params required for pack bitstream tile. 
+static void init_tile_pack_bs_params(AV1_COMP *const cpi, uint8_t *const dst, + struct aom_write_bit_buffer *saved_wb, + PackBSParams *const pack_bs_params_arr, + uint8_t obu_extn_header) { + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + AV1_COMMON *const cm = &cpi->common; + const CommonTileParams *const tiles = &cm->tiles; + const int num_tiles = tiles->cols * tiles->rows; + // Fixed size tile groups for the moment + const int num_tg_hdrs = cpi->num_tg; + // Tile group size in terms of number of tiles. + const int tg_size_in_tiles = (num_tiles + num_tg_hdrs - 1) / num_tg_hdrs; + uint8_t *tile_dst = dst; + uint8_t *tile_data_curr = dst; + // Max tile group count can not be more than MAX_TILES. + int tg_size_mi[MAX_TILES] = { 0 }; // Size of tile group in mi units + int tile_idx; + int tg_idx = 0; + int tile_count_in_tg = 0; + int new_tg = 1; + + // Populate pack bitstream params of all tiles. + for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) { + const TileInfo *const tile_info = &cpi->tile_data[tile_idx].tile_info; + PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx]; + // Calculate tile size in mi units. + const int tile_size_mi = (tile_info->mi_col_end - tile_info->mi_col_start) * + (tile_info->mi_row_end - tile_info->mi_row_start); + int is_last_tile_in_tg = 0; + tile_count_in_tg++; + if (tile_count_in_tg == tg_size_in_tiles || tile_idx == (num_tiles - 1)) + is_last_tile_in_tg = 1; + + // Populate pack bitstream params of this tile. 
+ pack_bs_params->curr_tg_hdr_size = 0; + pack_bs_params->obu_extn_header = obu_extn_header; + pack_bs_params->saved_wb = saved_wb; + pack_bs_params->obu_header_size = 0; + pack_bs_params->is_last_tile_in_tg = is_last_tile_in_tg; + pack_bs_params->new_tg = new_tg; + pack_bs_params->tile_col = tile_info->tile_col; + pack_bs_params->tile_row = tile_info->tile_row; + pack_bs_params->tile_size_mi = tile_size_mi; + tg_size_mi[tg_idx] += tile_size_mi; + + if (new_tg) new_tg = 0; + if (is_last_tile_in_tg) { + tile_count_in_tg = 0; + new_tg = 1; + tg_idx++; + } + } + + assert(cpi->available_bs_size > 0); + size_t tg_buf_size[MAX_TILES] = { 0 }; + size_t max_buf_size = cpi->available_bs_size; + size_t remain_buf_size = max_buf_size; + const int frame_size_mi = cm->mi_params.mi_rows * cm->mi_params.mi_cols; + + tile_idx = 0; + // Prepare obu, tile group and frame header of each tile group. + for (tg_idx = 0; tg_idx < cpi->num_tg; tg_idx++) { + PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx]; + int is_last_tg = tg_idx == cpi->num_tg - 1; + // Prorate bitstream buffer size based on tile group size and available + // buffer size. This buffer will be used to store headers and tile data. + tg_buf_size[tg_idx] = + get_bs_chunk_size(tg_size_mi[tg_idx], frame_size_mi, &remain_buf_size, + max_buf_size, is_last_tg); + + pack_bs_params->dst = tile_dst; + pack_bs_params->tile_data_curr = tile_dst; + + // Write obu, tile group and frame header at first tile in the tile + // group. + av1_write_obu_tg_tile_headers(cpi, xd, pack_bs_params, tile_idx); + tile_dst += tg_buf_size[tg_idx]; + + // Exclude headers from tile group buffer size. + tg_buf_size[tg_idx] -= pack_bs_params->curr_tg_hdr_size; + tile_idx += tg_size_in_tiles; + } + + tg_idx = 0; + // Calculate bitstream buffer size of each tile in the tile group. 
+ for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) { + PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx]; + + if (pack_bs_params->new_tg) { + max_buf_size = tg_buf_size[tg_idx]; + remain_buf_size = max_buf_size; + } + + // Prorate bitstream buffer size of this tile based on tile size and + // available buffer size. For this proration, header size is not accounted. + const size_t tile_buf_size = get_bs_chunk_size( + pack_bs_params->tile_size_mi, tg_size_mi[tg_idx], &remain_buf_size, + max_buf_size, pack_bs_params->is_last_tile_in_tg); + pack_bs_params->tile_buf_size = tile_buf_size; + + // Update base address of bitstream buffer for tile and tile group. + if (pack_bs_params->new_tg) { + tile_dst = pack_bs_params->dst; + tile_data_curr = pack_bs_params->tile_data_curr; + // Account header size in first tile of a tile group. + pack_bs_params->tile_buf_size += pack_bs_params->curr_tg_hdr_size; + } else { + pack_bs_params->dst = tile_dst; + pack_bs_params->tile_data_curr = tile_data_curr; + } + + if (pack_bs_params->is_last_tile_in_tg) tg_idx++; + tile_dst += pack_bs_params->tile_buf_size; + } +} + +// Worker hook function of pack bitsteam multithreading. 
+static int pack_bs_worker_hook(void *arg1, void *arg2) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + PackBSParams *const pack_bs_params = (PackBSParams *)arg2; + AV1_COMP *const cpi = thread_data->cpi; + AV1_COMMON *const cm = &cpi->common; + AV1EncPackBSSync *const pack_bs_sync = &cpi->mt_info.pack_bs_sync; + const CommonTileParams *const tiles = &cm->tiles; + const int num_tiles = tiles->cols * tiles->rows; + + while (1) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pack_bs_sync->mutex_); +#endif + const int tile_idx = get_next_pack_bs_tile_idx(pack_bs_sync, num_tiles); +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(pack_bs_sync->mutex_); +#endif + if (tile_idx == -1) break; + TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; + thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx; + + av1_pack_tile_info(cpi, thread_data->td, &pack_bs_params[tile_idx]); + } + + return 1; +} + +// Prepares thread data and workers of pack bitsteam multithreading. +static void prepare_pack_bs_workers(AV1_COMP *const cpi, + PackBSParams *const pack_bs_params, + AVxWorkerHook hook, const int num_workers) { + MultiThreadInfo *const mt_info = &cpi->mt_info; + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *worker = &mt_info->workers[i]; + EncWorkerData *const thread_data = &mt_info->tile_thr_data[i]; + if (i == 0) thread_data->td = &cpi->td; + + if (thread_data->td != &cpi->td) thread_data->td->mb = cpi->td.mb; + + thread_data->cpi = cpi; + thread_data->start = i; + thread_data->thread_id = i; + av1_reset_pack_bs_thread_data(thread_data->td); + + worker->hook = hook; + worker->data1 = thread_data; + worker->data2 = pack_bs_params; + } + + AV1_COMMON *const cm = &cpi->common; + AV1EncPackBSSync *const pack_bs_sync = &mt_info->pack_bs_sync; + const uint16_t num_tiles = cm->tiles.rows * cm->tiles.cols; +#if CONFIG_MULTITHREAD + if (pack_bs_sync->mutex_ == NULL) { + CHECK_MEM_ERROR(cm, pack_bs_sync->mutex_, + aom_malloc(sizeof(*pack_bs_sync->mutex_))); + if 
(pack_bs_sync->mutex_) pthread_mutex_init(pack_bs_sync->mutex_, NULL); + } +#endif + pack_bs_sync->next_job_idx = 0; + + PackBSTileOrder *const pack_bs_tile_order = pack_bs_sync->pack_bs_tile_order; + // Reset tile order data of pack bitstream + av1_zero_array(pack_bs_tile_order, num_tiles); + + // Populate pack bitstream tile order structure + for (uint16_t tile_idx = 0; tile_idx < num_tiles; tile_idx++) { + pack_bs_tile_order[tile_idx].abs_sum_level = + cpi->tile_data[tile_idx].abs_sum_level; + pack_bs_tile_order[tile_idx].tile_idx = tile_idx; + } + + // Sort tiles in descending order based on tile area. + qsort(pack_bs_tile_order, num_tiles, sizeof(*pack_bs_tile_order), + compare_tile_order); +} + +// Accumulates data after pack bitsteam processing. +static void accumulate_pack_bs_data( + AV1_COMP *const cpi, const PackBSParams *const pack_bs_params_arr, + uint8_t *const dst, uint32_t *total_size, const FrameHeaderInfo *fh_info, + int *const largest_tile_id, unsigned int *max_tile_size, + uint32_t *const obu_header_size, uint8_t **tile_data_start, + const int num_workers) { + const AV1_COMMON *const cm = &cpi->common; + const CommonTileParams *const tiles = &cm->tiles; + const int tile_count = tiles->cols * tiles->rows; + // Fixed size tile groups for the moment + size_t curr_tg_data_size = 0; + int is_first_tg = 1; + uint8_t *curr_tg_start = dst; + size_t src_offset = 0; + size_t dst_offset = 0; + + for (int tile_idx = 0; tile_idx < tile_count; tile_idx++) { + // PackBSParams stores all parameters required to pack tile and header + // info. 
+ const PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx]; + uint32_t tile_size = 0; + + if (pack_bs_params->new_tg) { + curr_tg_start = dst + *total_size; + curr_tg_data_size = pack_bs_params->curr_tg_hdr_size; + *tile_data_start += pack_bs_params->curr_tg_hdr_size; + *obu_header_size = pack_bs_params->obu_header_size; + } + curr_tg_data_size += + pack_bs_params->buf.size + (pack_bs_params->is_last_tile_in_tg ? 0 : 4); + + if (pack_bs_params->buf.size > *max_tile_size) { + *largest_tile_id = tile_idx; + *max_tile_size = (unsigned int)pack_bs_params->buf.size; + } + tile_size += + (uint32_t)pack_bs_params->buf.size + *pack_bs_params->total_size; + + // Pack all the chunks of tile bitstreams together + if (tile_idx != 0) memmove(dst + dst_offset, dst + src_offset, tile_size); + + if (pack_bs_params->is_last_tile_in_tg) + av1_write_last_tile_info( + cpi, fh_info, pack_bs_params->saved_wb, &curr_tg_data_size, + curr_tg_start, &tile_size, tile_data_start, largest_tile_id, + &is_first_tg, *obu_header_size, pack_bs_params->obu_extn_header); + src_offset += pack_bs_params->tile_buf_size; + dst_offset += tile_size; + *total_size += tile_size; + } + + // Accumulate thread data + MultiThreadInfo *const mt_info = &cpi->mt_info; + for (int idx = num_workers - 1; idx >= 0; idx--) { + ThreadData const *td = mt_info->tile_thr_data[idx].td; + av1_accumulate_pack_bs_thread_data(cpi, td); + } +} + +void av1_write_tile_obu_mt( + AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size, + struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header, + const FrameHeaderInfo *fh_info, int *const largest_tile_id, + unsigned int *max_tile_size, uint32_t *const obu_header_size, + uint8_t **tile_data_start) { + MultiThreadInfo *const mt_info = &cpi->mt_info; + const int num_workers = mt_info->num_mod_workers[MOD_PACK_BS]; + + PackBSParams pack_bs_params[MAX_TILES]; + uint32_t tile_size[MAX_TILES] = { 0 }; + + for (int tile_idx = 0; tile_idx < MAX_TILES; tile_idx++) + 
pack_bs_params[tile_idx].total_size = &tile_size[tile_idx]; + + init_tile_pack_bs_params(cpi, dst, saved_wb, pack_bs_params, obu_extn_header); + prepare_pack_bs_workers(cpi, pack_bs_params, pack_bs_worker_hook, + num_workers); + launch_workers(mt_info, num_workers); + sync_enc_workers(mt_info, &cpi->common, num_workers); + accumulate_pack_bs_data(cpi, pack_bs_params, dst, total_size, fh_info, + largest_tile_id, max_tile_size, obu_header_size, + tile_data_start, num_workers); +} + // Deallocate memory for CDEF search multi-thread synchronization. void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync) { (void)cdef_sync; @@ -1780,6 +2171,9 @@ static void update_next_job_info(AV1CdefSync *cdef_sync, int nvfb, int nhfb) { // Initializes cdef_sync parameters. static AOM_INLINE void cdef_reset_job_info(AV1CdefSync *cdef_sync) { +#if CONFIG_MULTITHREAD + if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL); +#endif // CONFIG_MULTITHREAD cdef_sync->end_of_frame = 0; cdef_sync->fbr = 0; cdef_sync->fbc = 0; @@ -1896,6 +2290,12 @@ static AOM_INLINE int compute_num_lr_workers(AV1_COMP *cpi) { return compute_num_enc_workers(cpi, cpi->oxcf.max_threads); } +// Computes num_workers for pack bitstream multi-threading. 
+static AOM_INLINE int compute_num_pack_bs_workers(AV1_COMP *cpi) { + if (cpi->oxcf.max_threads <= 1) return 1; + return compute_num_enc_tile_mt_workers(&cpi->common, cpi->oxcf.max_threads); +} + int compute_num_mod_workers(AV1_COMP *cpi, MULTI_THREADED_MODULES mod_name) { int num_mod_workers = 0; switch (mod_name) { @@ -1915,7 +2315,9 @@ int compute_num_mod_workers(AV1_COMP *cpi, MULTI_THREADED_MODULES mod_name) { case MOD_CDEF_SEARCH: num_mod_workers = compute_num_cdef_workers(cpi); break; + case MOD_CDEF: num_mod_workers = compute_num_cdef_workers(cpi); break; case MOD_LR: num_mod_workers = compute_num_lr_workers(cpi); break; + case MOD_PACK_BS: num_mod_workers = compute_num_pack_bs_workers(cpi); break; default: assert(0); break; } return (num_mod_workers); diff --git a/third_party/libaom/source/libaom/av1/encoder/ethread.h b/third_party/libaom/source/libaom/av1/encoder/ethread.h index 55e7f7be39..c2ab864690 100644 --- a/third_party/libaom/source/libaom/av1/encoder/ethread.h +++ b/third_party/libaom/source/libaom/av1/encoder/ethread.h @@ -80,6 +80,10 @@ int av1_get_max_num_workers(AV1_COMP *cpi); void av1_create_workers(AV1_COMP *cpi, int num_workers); +#if CONFIG_MULTITHREAD +void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass); +#endif // CONFIG_MULTITHREAD + void av1_create_second_pass_workers(AV1_COMP *cpi, int num_workers); void av1_cdef_mse_calc_frame_mt(AV1_COMMON *cm, MultiThreadInfo *mt_info, @@ -87,6 +91,13 @@ void av1_cdef_mse_calc_frame_mt(AV1_COMMON *cm, MultiThreadInfo *mt_info, void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync); +void av1_write_tile_obu_mt( + AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size, + struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header, + const FrameHeaderInfo *fh_info, int *const largest_tile_id, + unsigned int *max_tile_size, uint32_t *const obu_header_size, + uint8_t **tile_data_start); + #ifdef __cplusplus } // extern "C" #endif diff --git 
a/third_party/libaom/source/libaom/av1/encoder/external_partition.c b/third_party/libaom/source/libaom/av1/encoder/external_partition.c new file mode 100644 index 0000000000..542b2bb878 --- /dev/null +++ b/third_party/libaom/source/libaom/av1/encoder/external_partition.c @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/common.h" +#include "av1/encoder/external_partition.h" + +aom_codec_err_t av1_ext_part_create(aom_ext_part_funcs_t funcs, + aom_ext_part_config_t config, + ExtPartController *ext_part_controller) { + if (ext_part_controller == NULL) { + return AOM_CODEC_INVALID_PARAM; + } + ext_part_controller->funcs = funcs; + ext_part_controller->config = config; + const aom_ext_part_status_t status = ext_part_controller->funcs.create_model( + ext_part_controller->funcs.priv, &ext_part_controller->config, + &ext_part_controller->model); + if (status == AOM_EXT_PART_ERROR) { + return AOM_CODEC_ERROR; + } else if (status == AOM_EXT_PART_TEST) { + ext_part_controller->test_mode = 1; + ext_part_controller->ready = 0; + return AOM_CODEC_OK; + } + assert(status == AOM_EXT_PART_OK); + ext_part_controller->ready = 1; + return AOM_CODEC_OK; +} + +aom_codec_err_t av1_ext_part_init(ExtPartController *ext_part_controller) { + if (ext_part_controller == NULL) { + return AOM_CODEC_INVALID_PARAM; + } + av1_zero(ext_part_controller); + return AOM_CODEC_OK; +} + +aom_codec_err_t av1_ext_part_delete(ExtPartController *ext_part_controller) { + if 
(ext_part_controller == NULL) { + return AOM_CODEC_INVALID_PARAM; + } + if (ext_part_controller->ready) { + const aom_ext_part_status_t status = + ext_part_controller->funcs.delete_model(ext_part_controller->model); + if (status != AOM_EXT_PART_OK) { + return AOM_CODEC_ERROR; + } + } + return av1_ext_part_init(ext_part_controller); +} + +bool av1_ext_part_get_partition_decision(ExtPartController *ext_part_controller, + aom_partition_decision_t *decision) { + assert(ext_part_controller != NULL); + assert(ext_part_controller->ready); + assert(decision != NULL); + const aom_ext_part_status_t status = + ext_part_controller->funcs.get_partition_decision( + ext_part_controller->model, decision); + if (status != AOM_EXT_PART_OK) return false; + return true; +} + +bool av1_ext_part_send_partition_stats(ExtPartController *ext_part_controller, + const aom_partition_stats_t *stats) { + assert(ext_part_controller != NULL); + assert(ext_part_controller->ready); + assert(stats != NULL); + const aom_ext_part_status_t status = + ext_part_controller->funcs.send_partition_stats( + ext_part_controller->model, stats); + if (status != AOM_EXT_PART_OK) return false; + return true; +} + +bool av1_ext_part_send_features(ExtPartController *ext_part_controller, + const aom_partition_features_t *features) { + assert(ext_part_controller != NULL); + assert(ext_part_controller->ready); + assert(features != NULL); + const aom_ext_part_status_t status = ext_part_controller->funcs.send_features( + ext_part_controller->model, features); + if (status != AOM_EXT_PART_OK) return false; + return true; +} diff --git a/third_party/libaom/source/libaom/av1/encoder/external_partition.h b/third_party/libaom/source/libaom/av1/encoder/external_partition.h new file mode 100644 index 0000000000..20f03ed752 --- /dev/null +++ b/third_party/libaom/source/libaom/av1/encoder/external_partition.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_ +#define AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_ + +#include <stdbool.h> + +#include "aom/aom_codec.h" +#include "aom/aom_external_partition.h" + +#ifdef __cplusplus +extern "C" { +#endif +/*!\cond */ + +typedef struct ExtPartController { + int ready; + int test_mode; + aom_ext_part_config_t config; + aom_ext_part_model_t model; + aom_ext_part_funcs_t funcs; +} ExtPartController; + +aom_codec_err_t av1_ext_part_create(aom_ext_part_funcs_t funcs, + aom_ext_part_config_t config, + ExtPartController *ext_part_controller); + +aom_codec_err_t av1_ext_part_init(ExtPartController *ext_part_controller); + +aom_codec_err_t av1_ext_part_delete(ExtPartController *ext_part_controller); + +bool av1_ext_part_get_partition_decision(ExtPartController *ext_part_controller, + aom_partition_decision_t *decision); + +bool av1_ext_part_send_partition_stats(ExtPartController *ext_part_controller, + const aom_partition_stats_t *stats); + +bool av1_ext_part_send_features(ExtPartController *ext_part_controller, + const aom_partition_features_t *features); + +/*!\endcond */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_ diff --git a/third_party/libaom/source/libaom/av1/encoder/firstpass.c b/third_party/libaom/source/libaom/av1/encoder/firstpass.c index ff6814d04c..662b42c822 100644 --- a/third_party/libaom/source/libaom/av1/encoder/firstpass.c +++ b/third_party/libaom/source/libaom/av1/encoder/firstpass.c @@ -27,6 +27,7 
@@ #include "av1/common/entropymv.h" #include "av1/common/quant_common.h" #include "av1/common/reconinter.h" // av1_setup_dst_planes() +#include "av1/common/reconintra.h" #include "av1/common/txb_common.h" #include "av1/encoder/aq_variance.h" #include "av1/encoder/av1_quantize.h" @@ -54,6 +55,8 @@ #define NCOUNT_INTRA_THRESH 8192 #define NCOUNT_INTRA_FACTOR 3 +#define INVALID_FP_STATS_TO_PREDICT_FLAT_GOP -1 + static AOM_INLINE void output_stats(FIRSTPASS_STATS *stats, struct aom_codec_pkt_list *pktlist) { struct aom_codec_cx_pkt pkt; @@ -108,6 +111,9 @@ void av1_twopass_zero_stats(FIRSTPASS_STATS *section) { section->new_mv_count = 0.0; section->count = 0.0; section->duration = 1.0; + section->is_flash = 0; + section->noise_var = 0; + section->cor_coeff = 1.0; } void av1_accumulate_stats(FIRSTPASS_STATS *section, @@ -118,9 +124,11 @@ void av1_accumulate_stats(FIRSTPASS_STATS *section, section->frame_avg_wavelet_energy += frame->frame_avg_wavelet_energy; section->coded_error += frame->coded_error; section->sr_coded_error += frame->sr_coded_error; + section->tr_coded_error += frame->tr_coded_error; section->pcnt_inter += frame->pcnt_inter; section->pcnt_motion += frame->pcnt_motion; section->pcnt_second_ref += frame->pcnt_second_ref; + section->pcnt_third_ref += frame->pcnt_third_ref; section->pcnt_neutral += frame->pcnt_neutral; section->intra_skip_pct += frame->intra_skip_pct; section->inactive_zone_rows += frame->inactive_zone_rows; @@ -177,8 +185,9 @@ static int get_num_mbs(const BLOCK_SIZE fp_block_size, } void av1_end_first_pass(AV1_COMP *cpi) { - if (cpi->twopass.stats_buf_ctx->total_stats) - output_stats(cpi->twopass.stats_buf_ctx->total_stats, cpi->output_pkt_list); + if (cpi->ppi->twopass.stats_buf_ctx->total_stats && !cpi->ppi->lap_enabled) + output_stats(cpi->ppi->twopass.stats_buf_ctx->total_stats, + cpi->ppi->output_pkt_list); } static aom_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) { @@ -261,15 +270,12 @@ static AOM_INLINE void 
first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize = xd->mi[0]->bsize; const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY; const int sr = get_search_range(&cpi->initial_dimensions); - const int step_param = 3 + sr; + const int step_param = cpi->sf.fp_sf.reduce_mv_step_param + sr; const search_site_config *first_pass_search_sites = cpi->mv_search_params.search_site_cfg[SS_CFG_FPF]; const int fine_search_interval = cpi->is_screen_content_type && cpi->common.features.allow_intrabc; - if (fine_search_interval) { - av1_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed); - } FULLPEL_MOTION_SEARCH_PARAMS ms_params; av1_make_default_fullpel_ms_params(&ms_params, cpi, x, bsize, ref_mv, first_pass_search_sites, @@ -281,7 +287,7 @@ static AOM_INLINE void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x, &this_best_mv, NULL); if (tmp_err < INT_MAX) { - aom_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; + aom_variance_fn_ptr_t v_fn_ptr = cpi->ppi->fn_ptr[bsize]; const MSBuffers *ms_buffers = &ms_params.ms_buffers; tmp_err = av1_get_mvpred_sse(&ms_params.mv_cost_params, this_best_mv, &v_fn_ptr, ms_buffers->src, ms_buffers->ref) + @@ -355,6 +361,86 @@ static double raw_motion_error_stdev(int *raw_motion_err_list, return raw_err_stdev; } +static AOM_INLINE int do_third_ref_motion_search(const RateControlCfg *rc_cfg, + const GFConfig *gf_cfg) { + return use_ml_model_to_decide_flat_gop(rc_cfg) && can_disable_altref(gf_cfg); +} + +static AOM_INLINE int calc_wavelet_energy(const AV1EncoderConfig *oxcf) { + return (use_ml_model_to_decide_flat_gop(&oxcf->rc_cfg) && + can_disable_altref(&oxcf->gf_cfg)) || + (oxcf->q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL); +} +typedef struct intra_pred_block_pass1_args { + const SequenceHeader *seq_params; + MACROBLOCK *x; +} intra_pred_block_pass1_args; + +static INLINE void copy_rect(uint8_t *dst, int dstride, const uint8_t *src, + int sstride, int width, int height, int use_hbd) { +#if 
CONFIG_AV1_HIGHBITDEPTH + if (use_hbd) { + aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(src), sstride, + CONVERT_TO_SHORTPTR(dst), dstride, width, height); + } else { + aom_convolve_copy(src, sstride, dst, dstride, width, height); + } +#else + (void)use_hbd; + aom_convolve_copy(src, sstride, dst, dstride, width, height); +#endif +} + +static void first_pass_intra_pred_and_calc_diff(int plane, int block, + int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + (void)block; + struct intra_pred_block_pass1_args *const args = arg; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + MACROBLOCKD_PLANE *const pd = &xd->plane[plane]; + MACROBLOCK_PLANE *const p = &x->plane[plane]; + const int dst_stride = pd->dst.stride; + uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const SequenceHeader *seq_params = args->seq_params; + const int src_stride = p->src.stride; + uint8_t *src = &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2]; + + av1_predict_intra_block( + xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, pd->width, + pd->height, tx_size, mbmi->mode, 0, 0, FILTER_INTRA_MODES, src, + src_stride, dst, dst_stride, blk_col, blk_row, plane); + + av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); +} + +static void first_pass_predict_intra_block_for_luma_plane( + const SequenceHeader *seq_params, MACROBLOCK *x, BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); + const MACROBLOCKD *const xd = &x->e_mbd; + const int plane = AOM_PLANE_Y; + const MACROBLOCKD_PLANE *const pd = &xd->plane[plane]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); + const int dst_stride = pd->dst.stride; + uint8_t *dst = pd->dst.buf; + const MACROBLOCK_PLANE *const p = &x->plane[plane]; + const int src_stride = p->src.stride; + 
const uint8_t *src = p->src.buf; + + intra_pred_block_pass1_args args = { seq_params, x }; + av1_foreach_transformed_block_in_plane( + xd, plane_bsize, plane, first_pass_intra_pred_and_calc_diff, &args); + + // copy source data to recon buffer, as the recon buffer will be used as a + // reference frame subsequently. + copy_rect(dst, dst_stride, src, src_stride, block_size_wide[bsize], + block_size_high[bsize], seq_params->use_highbitdepth); +} + #define UL_INTRA_THRESH 50 #define INVALID_ROW -1 // Computes and returns the intra pred error of a block. @@ -388,11 +474,10 @@ static int firstpass_intra_prediction( const int qindex, FRAME_STATS *const stats) { const AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; const int unit_scale = mi_size_wide[fp_block_size]; - const int use_dc_pred = (unit_col || unit_row) && (!unit_col || !unit_row); const int num_planes = av1_num_planes(cm); const BLOCK_SIZE bsize = get_bsize(mi_params, fp_block_size, unit_row, unit_col); @@ -412,9 +497,12 @@ static int firstpass_intra_prediction( xd->mi[0]->segment_id = 0; xd->lossless[xd->mi[0]->segment_id] = (qindex == 0); xd->mi[0]->mode = DC_PRED; - xd->mi[0]->tx_size = use_dc_pred ? max_txsize_lookup[bsize] : TX_4X4; + xd->mi[0]->tx_size = TX_4X4; - av1_encode_intra_block_plane(cpi, x, bsize, 0, DRY_RUN_NORMAL, 0); + if (cpi->sf.fp_sf.disable_recon) + first_pass_predict_intra_block_for_luma_plane(seq_params, x, bsize); + else + av1_encode_intra_block_plane(cpi, x, bsize, 0, DRY_RUN_NORMAL, 0); int this_intra_error = aom_get_mb_ss(x->plane[0].src_diff); if (seq_params->use_highbitdepth) { switch (seq_params->bit_depth) { @@ -480,16 +568,22 @@ static int firstpass_intra_prediction( // Accumulate the intra error. 
stats->intra_error += (int64_t)this_intra_error; - const int hbd = is_cur_buf_hbd(xd); - const int stride = x->plane[0].src.stride; - const int num_8x8_rows = block_size_high[fp_block_size] / 8; - const int num_8x8_cols = block_size_wide[fp_block_size] / 8; - const uint8_t *buf = x->plane[0].src.buf; - for (int r8 = 0; r8 < num_8x8_rows; ++r8) { - for (int c8 = 0; c8 < num_8x8_cols; ++c8) { - stats->frame_avg_wavelet_energy += av1_haar_ac_sad_8x8_uint8_input( - buf + c8 * 8 + r8 * 8 * stride, stride, hbd); - } + // Stats based on wavelet energy is used in the following cases : + // 1. ML model which predicts if a flat structure (golden-frame only structure + // without ALT-REF and Internal-ARFs) is better. This ML model is enabled in + // constant quality mode under certain conditions. + // 2. Delta qindex mode is set as DELTA_Q_PERCEPTUAL. + // Thus, wavelet energy calculation is enabled for the above cases. + if (calc_wavelet_energy(&cpi->oxcf)) { + const int hbd = is_cur_buf_hbd(xd); + const int stride = x->plane[0].src.stride; + const int num_8x8_rows = block_size_high[fp_block_size] / 8; + const int num_8x8_cols = block_size_wide[fp_block_size] / 8; + const uint8_t *buf = x->plane[0].src.buf; + stats->frame_avg_wavelet_energy += av1_haar_ac_sad_mxn_uint8_input( + buf, stride, hbd, num_8x8_rows, num_8x8_cols); + } else { + stats->frame_avg_wavelet_energy = INVALID_FP_STATS_TO_PREDICT_FLAT_GOP; } return this_intra_error; @@ -516,13 +610,13 @@ static int get_prediction_error_bitdepth(const int is_high_bitdepth, static void accumulate_mv_stats(const MV best_mv, const FULLPEL_MV mv, const int mb_row, const int mb_col, const int mb_rows, const int mb_cols, - MV *last_mv, FRAME_STATS *stats) { + MV *last_non_zero_mv, FRAME_STATS *stats) { if (is_zero_mv(&best_mv)) return; ++stats->mv_count; // Non-zero vector, was it different from the last non zero vector? 
- if (!is_equal_mv(&best_mv, last_mv)) ++stats->new_mv_count; - *last_mv = best_mv; + if (!is_equal_mv(&best_mv, last_non_zero_mv)) ++stats->new_mv_count; + *last_non_zero_mv = best_mv; // Does the row vector point inwards or outwards? if (mb_row < mb_rows / 2) { @@ -555,7 +649,6 @@ static void accumulate_mv_stats(const MV best_mv, const FULLPEL_MV mv, } } -#define LOW_MOTION_ERROR_THRESH 25 // Computes and returns the inter prediction error from the last frame. // Computes inter prediction errors from the golden and alt ref frams and // Updates stats accordingly. @@ -576,8 +669,9 @@ static void accumulate_mv_stats(const MV best_mv, const FULLPEL_MV mv, // this_intra_error: the intra prediction error of this block. // raw_motion_err_counts: the count of raw motion vectors. // raw_motion_err_list: the array that records the raw motion error. -// best_ref_mv: best reference mv found so far. -// last_mv: last mv. +// ref_mv: the reference used to start the motion search +// best_mv: the best mv found +// last_non_zero_mv: the last non zero mv found in this tile row. // stats: frame encoding stats. // Modifies: // raw_motion_err_list @@ -593,8 +687,8 @@ static int firstpass_inter_prediction( const int unit_col, const int recon_yoffset, const int recon_uvoffset, const int src_yoffset, const int alt_ref_frame_yoffset, const BLOCK_SIZE fp_block_size, const int this_intra_error, - const int raw_motion_err_counts, int *raw_motion_err_list, MV *best_ref_mv, - MV *last_mv, FRAME_STATS *stats) { + const int raw_motion_err_counts, int *raw_motion_err_list, const MV ref_mv, + MV *best_mv, MV *last_non_zero_mv, FRAME_STATS *stats) { int this_inter_error = this_intra_error; AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; @@ -612,7 +706,6 @@ static int firstpass_inter_prediction( const int unit_cols = get_unit_cols(fp_block_size, mi_params->mb_cols); // Assume 0,0 motion with no mv overhead. 
FULLPEL_MV mv = kZeroFullMv; - FULLPEL_MV tmp_mv = kZeroFullMv; xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset; // Set up limit values for motion vectors to prevent them extending // outside the UMV borders. @@ -636,15 +729,15 @@ static int firstpass_inter_prediction( &unscaled_last_source_buf_2d); raw_motion_err_list[raw_motion_err_counts] = raw_motion_error; - // TODO(pengchong): Replace the hard-coded threshold - if (raw_motion_error > LOW_MOTION_ERROR_THRESH || cpi->oxcf.speed <= 2) { + if (raw_motion_error > cpi->sf.fp_sf.skip_motion_search_threshold) { // Test last reference frame using the previous best mv as the // starting point (best reference) for the search. - first_pass_motion_search(cpi, x, best_ref_mv, &mv, &motion_error); + first_pass_motion_search(cpi, x, &ref_mv, &mv, &motion_error); // If the current best reference mv is not centered on 0,0 then do a // 0,0 based search as well. - if (!is_zero_mv(best_ref_mv)) { + if (!is_zero_mv(&ref_mv)) { + FULLPEL_MV tmp_mv = kZeroFullMv; int tmp_err = INT_MAX; first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &tmp_err); @@ -657,6 +750,7 @@ static int firstpass_inter_prediction( // Motion search in 2nd reference frame. int gf_motion_error = motion_error; if ((current_frame->frame_number > 1) && golden_frame != NULL) { + FULLPEL_MV tmp_mv = kZeroFullMv; // Assume 0,0 motion with no mv overhead. xd->plane[0].pre[0].buf = golden_frame->y_buffer + recon_yoffset; xd->plane[0].pre[0].stride = golden_frame->y_stride; @@ -682,13 +776,22 @@ static int firstpass_inter_prediction( // Motion search in 3rd reference frame. 
int alt_motion_error = motion_error; - if (alt_ref_frame != NULL) { - xd->plane[0].pre[0].buf = alt_ref_frame->y_buffer + alt_ref_frame_yoffset; - xd->plane[0].pre[0].stride = alt_ref_frame->y_stride; - alt_motion_error = - get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize, - &x->plane[0].src, &xd->plane[0].pre[0]); - first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &alt_motion_error); + // The ML model to predict if a flat structure (golden-frame only structure + // without ALT-REF and Internal-ARFs) is better requires stats based on + // motion search w.r.t 3rd reference frame in the first pass. As the ML + // model is enabled under certain conditions, motion search in 3rd reference + // frame is also enabled for those cases. + if (do_third_ref_motion_search(&cpi->oxcf.rc_cfg, &cpi->oxcf.gf_cfg)) { + if (alt_ref_frame != NULL) { + FULLPEL_MV tmp_mv = kZeroFullMv; + xd->plane[0].pre[0].buf = + alt_ref_frame->y_buffer + alt_ref_frame_yoffset; + xd->plane[0].pre[0].stride = alt_ref_frame->y_stride; + alt_motion_error = get_prediction_error_bitdepth( + is_high_bitdepth, bitdepth, bsize, &x->plane[0].src, + &xd->plane[0].pre[0]); + first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &alt_motion_error); + } } if (alt_motion_error < motion_error && alt_motion_error < gf_motion_error && alt_motion_error < this_intra_error) { @@ -716,8 +819,7 @@ static int firstpass_inter_prediction( } // Start by assuming that intra mode is best. 
- best_ref_mv->row = 0; - best_ref_mv->col = 0; + *best_mv = kZeroMv; if (motion_error <= this_intra_error) { aom_clear_system_state(); @@ -736,28 +838,30 @@ static int firstpass_inter_prediction( (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_intra_error); } - const MV best_mv = get_mv_from_fullmv(&mv); + *best_mv = get_mv_from_fullmv(&mv); this_inter_error = motion_error; xd->mi[0]->mode = NEWMV; - xd->mi[0]->mv[0].as_mv = best_mv; + xd->mi[0]->mv[0].as_mv = *best_mv; xd->mi[0]->tx_size = TX_4X4; xd->mi[0]->ref_frame[0] = LAST_FRAME; xd->mi[0]->ref_frame[1] = NONE_FRAME; - av1_enc_build_inter_predictor(cm, xd, unit_row * unit_scale, - unit_col * unit_scale, NULL, bsize, - AOM_PLANE_Y, AOM_PLANE_Y); - av1_encode_sby_pass1(cpi, x, bsize); - stats->sum_mvr += best_mv.row; - stats->sum_mvr_abs += abs(best_mv.row); - stats->sum_mvc += best_mv.col; - stats->sum_mvc_abs += abs(best_mv.col); - stats->sum_mvrs += best_mv.row * best_mv.row; - stats->sum_mvcs += best_mv.col * best_mv.col; + + if (cpi->sf.fp_sf.disable_recon == 0) { + av1_enc_build_inter_predictor(cm, xd, unit_row * unit_scale, + unit_col * unit_scale, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + av1_encode_sby_pass1(cpi, x, bsize); + } + stats->sum_mvr += best_mv->row; + stats->sum_mvr_abs += abs(best_mv->row); + stats->sum_mvc += best_mv->col; + stats->sum_mvc_abs += abs(best_mv->col); + stats->sum_mvrs += best_mv->row * best_mv->row; + stats->sum_mvcs += best_mv->col * best_mv->col; ++stats->inter_count; - *best_ref_mv = best_mv; - accumulate_mv_stats(best_mv, mv, unit_row, unit_col, unit_rows, unit_cols, - last_mv, stats); + accumulate_mv_stats(*best_mv, mv, unit_row, unit_col, unit_rows, unit_cols, + last_non_zero_mv, stats); } return this_inter_error; @@ -783,7 +887,7 @@ static void update_firstpass_stats(AV1_COMP *cpi, const int frame_number, const int64_t ts_duration, const BLOCK_SIZE fp_block_size) { - TWO_PASS *twopass = &cpi->twopass; + TWO_PASS *twopass = &cpi->ppi->twopass; AV1_COMMON 
*const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; FIRSTPASS_STATS *this_frame_stats = twopass->stats_buf_ctx->stats_in_end; @@ -817,6 +921,9 @@ static void update_firstpass_stats(AV1_COMP *cpi, fps.inactive_zone_rows = (double)stats->image_data_start_row; fps.inactive_zone_cols = (double)0; // TODO(paulwilkins): fix fps.raw_error_stdev = raw_err_stdev; + fps.is_flash = 0; + fps.noise_var = (double)0; + fps.cor_coeff = (double)1.0; if (stats->mv_count > 0) { fps.MVr = (double)stats->sum_mvr / stats->mv_count; @@ -849,12 +956,20 @@ static void update_firstpass_stats(AV1_COMP *cpi, // cpi->source_time_stamp. fps.duration = (double)ts_duration; + // Invalidate the stats related to third ref motion search if not valid. + // This helps to print a warning in second pass encoding. + if (do_third_ref_motion_search(&cpi->oxcf.rc_cfg, &cpi->oxcf.gf_cfg) == 0) { + fps.pcnt_third_ref = INVALID_FP_STATS_TO_PREDICT_FLAT_GOP; + fps.tr_coded_error = INVALID_FP_STATS_TO_PREDICT_FLAT_GOP; + } + // We will store the stats inside the persistent twopass struct (and NOT the // local variable 'fps'), and then cpi->output_pkt_list will point to it. 
*this_frame_stats = fps; - output_stats(this_frame_stats, cpi->output_pkt_list); - if (cpi->twopass.stats_buf_ctx->total_stats != NULL) { - av1_accumulate_stats(cpi->twopass.stats_buf_ctx->total_stats, &fps); + if (!cpi->ppi->lap_enabled) + output_stats(this_frame_stats, cpi->ppi->output_pkt_list); + if (cpi->ppi->twopass.stats_buf_ctx->total_stats != NULL) { + av1_accumulate_stats(cpi->ppi->twopass.stats_buf_ctx->total_stats, &fps); } /*In the case of two pass, first pass uses it as a circular buffer, * when LAP is enabled it is used as a linear buffer*/ @@ -982,6 +1097,17 @@ static void first_pass_tiles(AV1_COMP *cpi, const BLOCK_SIZE fp_block_size) { AV1_COMMON *const cm = &cpi->common; const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; + const int num_planes = av1_num_planes(&cpi->common); + for (int plane = 0; plane < num_planes; plane++) { + const int subsampling_xy = + plane ? cm->seq_params->subsampling_x + cm->seq_params->subsampling_y + : 0; + const int sb_size = MAX_SB_SQUARE >> subsampling_xy; + CHECK_MEM_ERROR( + cm, cpi->td.mb.plane[plane].src_diff, + (int16_t *)aom_memalign( + 32, sizeof(*cpi->td.mb.plane[plane].src_diff) * sb_size)); + } for (int tile_row = 0; tile_row < tile_rows; ++tile_row) { for (int tile_col = 0; tile_col < tile_cols; ++tile_col) { TileDataEnc *const tile_data = @@ -989,6 +1115,12 @@ static void first_pass_tiles(AV1_COMP *cpi, const BLOCK_SIZE fp_block_size) { first_pass_tile(cpi, &cpi->td, tile_data, fp_block_size); } } + for (int plane = 0; plane < num_planes; plane++) { + if (cpi->td.mb.plane[plane].src_diff) { + aom_free(cpi->td.mb.plane[plane].src_diff); + cpi->td.mb.plane[plane].src_diff = NULL; + } + } } void av1_first_pass_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, @@ -997,7 +1129,7 @@ void av1_first_pass_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; CurrentFrame *const 
current_frame = &cm->current_frame; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; TileInfo *tile = &tile_data->tile_info; @@ -1105,7 +1237,7 @@ void av1_first_pass_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, cpi, td, last_frame, golden_frame, alt_ref_frame, unit_row, unit_col, recon_yoffset, recon_uvoffset, src_yoffset, alt_ref_frame_yoffset, fp_block_size, this_intra_error, raw_motion_err_counts, - raw_motion_err_list, &best_ref_mv, &last_mv, mb_stats); + raw_motion_err_list, best_ref_mv, &best_ref_mv, &last_mv, mb_stats); if (unit_col_in_tile == 0) { *first_top_mv = last_mv; } @@ -1138,7 +1270,7 @@ void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) { AV1_COMMON *const cm = &cpi->common; const CommonModeInfoParams *const mi_params = &cm->mi_params; CurrentFrame *const current_frame = &cm->current_frame; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; const int qindex = find_fp_qindex(seq_params->bit_depth); @@ -1147,9 +1279,14 @@ void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) { FeatureFlags *const features = &cm->features; av1_set_screen_content_options(cpi, features); } + + // Prepare the speed features + av1_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed); + // Unit size for the first pass encoding. const BLOCK_SIZE fp_block_size = - cpi->is_screen_content_type ? BLOCK_8X8 : BLOCK_16X16; + get_fp_block_size(cpi->is_screen_content_type); + // Number of rows in the unit size. // Note mi_params->mb_rows and mi_params->mb_cols are in the unit of 16x16. 
const int unit_rows = get_unit_rows(fp_block_size, mi_params->mb_rows); @@ -1250,7 +1387,7 @@ void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) { (stats.image_data_start_row * unit_cols * 2)); } - TWO_PASS *twopass = &cpi->twopass; + TWO_PASS *twopass = &cpi->ppi->twopass; const int num_mbs_16X16 = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE) ? cpi->initial_mbs : mi_params->MBs; diff --git a/third_party/libaom/source/libaom/av1/encoder/firstpass.h b/third_party/libaom/source/libaom/av1/encoder/firstpass.h index 22969e885b..122912f72a 100644 --- a/third_party/libaom/source/libaom/av1/encoder/firstpass.h +++ b/third_party/libaom/source/libaom/av1/encoder/firstpass.h @@ -152,6 +152,18 @@ typedef struct { * standard deviation for (0, 0) motion prediction error */ double raw_error_stdev; + /*! + * Whether the frame contains a flash + */ + int64_t is_flash; + /*! + * Estimated noise variance + */ + double noise_var; + /*! + * Correlation coefficient with the previous frame + */ + double cor_coeff; } FIRSTPASS_STATS; /*!\cond */ @@ -170,8 +182,6 @@ enum { */ typedef struct { /*!\cond */ - // The frame processing order within a GOP - unsigned char index; // Frame update type, e.g. ARF/GF/LF/Overlay FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH]; unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH]; @@ -191,6 +201,21 @@ typedef struct { REFBUF_STATE refbuf_state[MAX_STATIC_GF_GROUP_LENGTH]; int arf_index; // the index in the gf group of ARF, if no arf, then -1 int size; // The total length of a GOP +#if CONFIG_FRAME_PARALLEL_ENCODE + // Indicates the level of parallelism in frame parallel encodes. + // 0 : frame is independently encoded (not part of parallel encodes). + // 1 : frame is the first in encode order in a given parallel encode set. + // 2 : frame occurs later in encode order in a given parallel encode set. + int frame_parallel_level[MAX_STATIC_GF_GROUP_LENGTH]; + // Indicates whether a frame should act as non-reference frame. 
+ // 0 : frame is a reference frame. + // 1 : frame is a non-reference frame. + int is_frame_non_ref[MAX_STATIC_GF_GROUP_LENGTH]; + + // The offset into lookahead_ctx for choosing + // source of frame parallel encodes. + int src_offset[MAX_STATIC_GF_GROUP_LENGTH]; +#endif // CONFIG_FRAME_PARALLEL_ENCODE /*!\endcond */ } GF_GROUP; /*!\cond */ @@ -327,6 +352,15 @@ struct EncodeFrameParams; struct AV1EncoderConfig; struct TileDataEnc; +static INLINE int is_fp_wavelet_energy_invalid( + const FIRSTPASS_STATS *fp_stats) { + return (fp_stats->frame_avg_wavelet_energy < 0); +} + +static INLINE BLOCK_SIZE get_fp_block_size(int is_screen_content_type) { + return (is_screen_content_type ? BLOCK_8X8 : BLOCK_16X16); +} + int av1_get_unit_rows_in_tile(TileInfo tile, const BLOCK_SIZE fp_block_size); int av1_get_unit_cols_in_tile(TileInfo tile, const BLOCK_SIZE fp_block_size); diff --git a/third_party/libaom/source/libaom/av1/encoder/global_motion_facade.c b/third_party/libaom/source/libaom/av1/encoder/global_motion_facade.c index 31c69da7eb..01ef7b0843 100644 --- a/third_party/libaom/source/libaom/av1/encoder/global_motion_facade.c +++ b/third_party/libaom/source/libaom/av1/encoder/global_motion_facade.c @@ -108,10 +108,10 @@ static AOM_INLINE void compute_global_motion_for_ref_frame( const int do_adaptive_gm_estimation = 0; const int ref_frame_dist = get_relative_dist( - &cm->seq_params.order_hint_info, cm->current_frame.order_hint, + &cm->seq_params->order_hint_info, cm->current_frame.order_hint, cm->cur_frame->ref_order_hints[frame - LAST_FRAME]); const GlobalMotionEstimationType gm_estimation_type = - cm->seq_params.order_hint_info.enable_order_hint && + cm->seq_params->order_hint_info.enable_order_hint && abs(ref_frame_dist) <= 2 && do_adaptive_gm_estimation ? 
GLOBAL_MOTION_DISFLOW_BASED : GLOBAL_MOTION_FEATURE_BASED; @@ -126,7 +126,7 @@ static AOM_INLINE void compute_global_motion_for_ref_frame( av1_compute_global_motion(model, src_buffer, src_width, src_height, src_stride, src_corners, num_src_corners, - ref_buf[frame], cpi->common.seq_params.bit_depth, + ref_buf[frame], cpi->common.seq_params->bit_depth, gm_estimation_type, inliers_by_motion, params_by_motion, RANSAC_NUM_MOTIONS); int64_t ref_frame_error = 0; @@ -284,9 +284,9 @@ static AOM_INLINE void update_valid_ref_frames_for_gm( AV1_COMMON *const cm = &cpi->common; int *num_past_ref_frames = &num_ref_frames[0]; int *num_future_ref_frames = &num_ref_frames[1]; - const GF_GROUP *gf_group = &cpi->gf_group; + const GF_GROUP *gf_group = &cpi->ppi->gf_group; int ref_pruning_enabled = is_frame_eligible_for_ref_pruning( - gf_group, cpi->sf.inter_sf.selective_ref_frame, 1, gf_group->index); + gf_group, cpi->sf.inter_sf.selective_ref_frame, 1, cpi->gf_frame_index); for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; --frame) { const MV_REFERENCE_FRAME ref_frame[2] = { frame, NONE_FRAME }; @@ -368,7 +368,7 @@ static AOM_INLINE void setup_global_motion_info_params(AV1_COMP *cpi) { // The source buffer is 16-bit, so we need to convert to 8 bits for the // following code. We cache the result until the source frame is released. 
gm_info->src_buffer = - av1_downconvert_frame(source, cpi->common.seq_params.bit_depth); + av1_downconvert_frame(source, cpi->common.seq_params->bit_depth); } gm_info->segment_map_w = diff --git a/third_party/libaom/source/libaom/av1/encoder/gop_structure.c b/third_party/libaom/source/libaom/av1/encoder/gop_structure.c index 0e4968a72f..9cf72d2733 100644 --- a/third_party/libaom/source/libaom/av1/encoder/gop_structure.c +++ b/third_party/libaom/source/libaom/av1/encoder/gop_structure.c @@ -26,12 +26,51 @@ #include "av1/encoder/firstpass.h" #include "av1/encoder/gop_structure.h" +#if CONFIG_FRAME_PARALLEL_ENCODE +// This function sets gf_group->frame_parallel_level for LF_UPDATE frames based +// on the value of parallel_frame_count. +static void set_frame_parallel_level(int *frame_parallel_level, + int *parallel_frame_count, + int max_parallel_frames) { + assert(*parallel_frame_count > 0); + // parallel_frame_count > 1 indicates subsequent frame(s) in the current + // parallel encode set. + *frame_parallel_level = 1 + (*parallel_frame_count > 1); + // Update the count of no. of parallel frames. + (*parallel_frame_count)++; + if (*parallel_frame_count > max_parallel_frames) *parallel_frame_count = 1; +} + +// This function sets gf_group->src_offset based on frame_parallel_level. +// Outputs are gf_group->src_offset and first_frame_index +static void set_src_offset(GF_GROUP *const gf_group, int *first_frame_index, + int cur_frame_idx, int frame_ind) { + if (gf_group->frame_parallel_level[frame_ind] > 0) { + if (gf_group->frame_parallel_level[frame_ind] == 1) { + *first_frame_index = cur_frame_idx; + } + + // Obtain the offset of the frame at frame_ind in the lookahead queue by + // subtracting the display order hints of the current frame from the display + // order hint of the first frame in parallel encoding set (at + // first_frame_index). 
+ gf_group->src_offset[frame_ind] = + (cur_frame_idx + gf_group->arf_src_offset[frame_ind]) - + *first_frame_index; + } +} +#endif // CONFIG_FRAME_PARALLEL_ENCODE + // Set parameters for frames between 'start' and 'end' (excluding both). -static void set_multi_layer_params(const TWO_PASS *twopass, - GF_GROUP *const gf_group, RATE_CONTROL *rc, - FRAME_INFO *frame_info, int start, int end, - int *cur_frame_idx, int *frame_ind, - int layer_depth) { +static void set_multi_layer_params( + const TWO_PASS *twopass, GF_GROUP *const gf_group, + const PRIMARY_RATE_CONTROL *p_rc, RATE_CONTROL *rc, FRAME_INFO *frame_info, + int start, int end, int *cur_frame_idx, int *frame_ind, +#if CONFIG_FRAME_PARALLEL_ENCODE + int *parallel_frame_count, int max_parallel_frames, + int do_frame_parallel_encode, int *first_frame_index, +#endif // CONFIG_FRAME_PARALLEL_ENCODE + int layer_depth) { const int num_frames_to_process = end - start; // Either we are at the last level of the pyramid, or we don't have enough @@ -45,11 +84,21 @@ static void set_multi_layer_params(const TWO_PASS *twopass, gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx; gf_group->layer_depth[*frame_ind] = MAX_ARF_LAYERS; gf_group->arf_boost[*frame_ind] = av1_calc_arf_boost( - twopass, rc, frame_info, start, end - start, 0, NULL, NULL); + twopass, p_rc, rc, frame_info, start, end - start, 0, NULL, NULL, 0); gf_group->frame_type[*frame_ind] = INTER_FRAME; gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, layer_depth); +#if CONFIG_FRAME_PARALLEL_ENCODE + // Set the level of parallelism for the LF_UPDATE frame. + if (do_frame_parallel_encode) { + set_frame_parallel_level(&gf_group->frame_parallel_level[*frame_ind], + parallel_frame_count, max_parallel_frames); + // Set LF_UPDATE frames as non-reference frames. 
+ gf_group->is_frame_non_ref[*frame_ind] = 1; + } + set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind); +#endif // CONFIG_FRAME_PARALLEL_ENCODE ++(*frame_ind); ++(*cur_frame_idx); ++start; @@ -65,14 +114,32 @@ static void set_multi_layer_params(const TWO_PASS *twopass, gf_group->frame_type[*frame_ind] = INTER_FRAME; gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; +#if CONFIG_FRAME_PARALLEL_ENCODE + if (do_frame_parallel_encode) { + // If max_parallel_frames is not exceeded, encode the next internal ARF + // frame in parallel. + if (*parallel_frame_count > 1 && + *parallel_frame_count <= max_parallel_frames) { + gf_group->frame_parallel_level[*frame_ind] = 2; + *parallel_frame_count = 1; + } + } + set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind); +#endif // CONFIG_FRAME_PARALLEL_ENCODE + // Get the boost factor for intermediate ARF frames. gf_group->arf_boost[*frame_ind] = av1_calc_arf_boost( - twopass, rc, frame_info, m, end - m, m - start, NULL, NULL); + twopass, p_rc, rc, frame_info, m, end - m, m - start, NULL, NULL, 0); ++(*frame_ind); // Frames displayed before this internal ARF. - set_multi_layer_params(twopass, gf_group, rc, frame_info, start, m, - cur_frame_idx, frame_ind, layer_depth + 1); + set_multi_layer_params(twopass, gf_group, p_rc, rc, frame_info, start, m, + cur_frame_idx, frame_ind, +#if CONFIG_FRAME_PARALLEL_ENCODE + parallel_frame_count, max_parallel_frames, + do_frame_parallel_encode, first_frame_index, +#endif // CONFIG_FRAME_PARALLEL_ENCODE + layer_depth + 1); // Overlay for internal ARF. 
gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE; @@ -82,12 +149,21 @@ static void set_multi_layer_params(const TWO_PASS *twopass, gf_group->layer_depth[*frame_ind] = layer_depth; gf_group->frame_type[*frame_ind] = INTER_FRAME; gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE; + +#if CONFIG_FRAME_PARALLEL_ENCODE + set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind); +#endif // CONFIG_FRAME_PARALLEL_ENCODE ++(*frame_ind); ++(*cur_frame_idx); // Frames displayed after this internal ARF. - set_multi_layer_params(twopass, gf_group, rc, frame_info, m + 1, end, - cur_frame_idx, frame_ind, layer_depth + 1); + set_multi_layer_params(twopass, gf_group, p_rc, rc, frame_info, m + 1, end, + cur_frame_idx, frame_ind, +#if CONFIG_FRAME_PARALLEL_ENCODE + parallel_frame_count, max_parallel_frames, + do_frame_parallel_encode, first_frame_index, +#endif // CONFIG_FRAME_PARALLEL_ENCODE + layer_depth + 1); } } @@ -95,6 +171,7 @@ static int construct_multi_layer_gf_structure( AV1_COMP *cpi, TWO_PASS *twopass, GF_GROUP *const gf_group, RATE_CONTROL *rc, FRAME_INFO *const frame_info, int gf_interval, FRAME_UPDATE_TYPE first_frame_update_type) { + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; int frame_index = 0; int cur_frame_index = 0; @@ -103,6 +180,18 @@ static int construct_multi_layer_gf_structure( first_frame_update_type == OVERLAY_UPDATE || first_frame_update_type == GF_UPDATE); +#if CONFIG_FRAME_PARALLEL_ENCODE + // Initialize gf_group->frame_parallel_level and gf_group->is_frame_non_ref to + // 0. 
+ memset( + gf_group->frame_parallel_level, 0, + sizeof(gf_group->frame_parallel_level[0]) * MAX_STATIC_GF_GROUP_LENGTH); + memset(gf_group->is_frame_non_ref, 0, + sizeof(gf_group->is_frame_non_ref[0]) * MAX_STATIC_GF_GROUP_LENGTH); + memset(gf_group->src_offset, 0, + sizeof(gf_group->src_offset[0]) * MAX_STATIC_GF_GROUP_LENGTH); +#endif + if (first_frame_update_type == KF_UPDATE && cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1) { gf_group->update_type[frame_index] = ARF_UPDATE; @@ -146,7 +235,7 @@ static int construct_multi_layer_gf_structure( gf_group->arf_src_offset[frame_index] = gf_interval - cur_frame_index; gf_group->cur_frame_idx[frame_index] = cur_frame_index; gf_group->layer_depth[frame_index] = 1; - gf_group->arf_boost[frame_index] = cpi->rc.gfu_boost; + gf_group->arf_boost[frame_index] = cpi->ppi->p_rc.gfu_boost; gf_group->frame_type[frame_index] = is_fwd_kf ? KEY_FRAME : INTER_FRAME; gf_group->refbuf_state[frame_index] = REFBUF_UPDATE; gf_group->max_layer_depth = 1; @@ -156,9 +245,25 @@ static int construct_multi_layer_gf_structure( gf_group->arf_index = -1; } +#if CONFIG_FRAME_PARALLEL_ENCODE + // Running count of no. of frames that is part of a given parallel + // encode set in a gf_group. Value of 1 indicates no parallel encode. + int parallel_frame_count = 1; + // Enable parallel encode of frames if gf_group has a multi-layer pyramid + // structure. + int do_frame_parallel_encode = (cpi->ppi->num_fp_contexts > 1 && use_altref); + + int first_frame_index = cur_frame_index; +#endif // CONFIG_FRAME_PARALLEL_ENCODE + // Rest of the frames. 
- set_multi_layer_params(twopass, gf_group, rc, frame_info, cur_frame_index, - gf_interval, &cur_frame_index, &frame_index, + set_multi_layer_params(twopass, gf_group, p_rc, rc, frame_info, + cur_frame_index, gf_interval, &cur_frame_index, + &frame_index, +#if CONFIG_FRAME_PARALLEL_ENCODE + ¶llel_frame_count, cpi->ppi->num_fp_contexts, + do_frame_parallel_encode, &first_frame_index, +#endif // CONFIG_FRAME_PARALLEL_ENCODE use_altref + 1); if (use_altref) { @@ -181,25 +286,41 @@ static int construct_multi_layer_gf_structure( gf_group->frame_type[frame_index] = INTER_FRAME; gf_group->refbuf_state[frame_index] = REFBUF_UPDATE; gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, 2); +#if CONFIG_FRAME_PARALLEL_ENCODE + set_src_offset(gf_group, &first_frame_index, cur_frame_index, + frame_index); +#endif ++frame_index; } } +#if CONFIG_FRAME_PARALLEL_ENCODE + if (do_frame_parallel_encode) { + // If frame_parallel_level is set to 1 for the last LF_UPDATE + // frame in the gf_group, reset it to zero since there are no subsequent + // frames in the gf_group. + if (gf_group->frame_parallel_level[frame_index - 2] == 1) { + assert(gf_group->update_type[frame_index - 2] == LF_UPDATE); + gf_group->frame_parallel_level[frame_index - 2] = 0; + } + } +#endif return frame_index; } void av1_gop_setup_structure(AV1_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; - GF_GROUP *const gf_group = &cpi->gf_group; - TWO_PASS *const twopass = &cpi->twopass; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; + TWO_PASS *const twopass = &cpi->ppi->twopass; FRAME_INFO *const frame_info = &cpi->frame_info; const int key_frame = rc->frames_since_key == 0; const FRAME_UPDATE_TYPE first_frame_update_type = - key_frame - ? KF_UPDATE - : cpi->gf_state.arf_gf_boost_lst || (rc->baseline_gf_interval == 1) - ? OVERLAY_UPDATE - : GF_UPDATE; + key_frame ? KF_UPDATE + : cpi->ppi->gf_state.arf_gf_boost_lst || + (p_rc->baseline_gf_interval == 1) + ? 
OVERLAY_UPDATE + : GF_UPDATE; gf_group->size = construct_multi_layer_gf_structure( - cpi, twopass, gf_group, rc, frame_info, rc->baseline_gf_interval - 1, + cpi, twopass, gf_group, rc, frame_info, p_rc->baseline_gf_interval - 1, first_frame_update_type); } diff --git a/third_party/libaom/source/libaom/av1/encoder/gop_structure.h b/third_party/libaom/source/libaom/av1/encoder/gop_structure.h index 6cfca22862..aeffb40acb 100644 --- a/third_party/libaom/source/libaom/av1/encoder/gop_structure.h +++ b/third_party/libaom/source/libaom/av1/encoder/gop_structure.h @@ -66,10 +66,11 @@ void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc, int64_t gf_group_bits); /*!\cond */ -int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc, +int av1_calc_arf_boost(const TWO_PASS *twopass, + const PRIMARY_RATE_CONTROL *p_rc, const RATE_CONTROL *rc, FRAME_INFO *frame_info, int offset, int f_frames, int b_frames, int *num_fpstats_used, - int *num_fpstats_required); + int *num_fpstats_required, int project_gfu_boost); /*!\endcond */ #ifdef __cplusplus diff --git a/third_party/libaom/source/libaom/av1/encoder/hybrid_fwd_txfm.c b/third_party/libaom/source/libaom/av1/encoder/hybrid_fwd_txfm.c index 08c167a9d6..eda5ddf78c 100644 --- a/third_party/libaom/source/libaom/av1/encoder/hybrid_fwd_txfm.c +++ b/third_party/libaom/source/libaom/av1/encoder/hybrid_fwd_txfm.c @@ -14,6 +14,7 @@ #include "config/aom_dsp_rtcd.h" #include "av1/common/idct.h" +#include "av1/common/blockd.h" #include "av1/encoder/hybrid_fwd_txfm.h" /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per @@ -313,3 +314,26 @@ void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, default: assert(0); break; } } + +void av1_quick_txfm(int use_hadamard, TX_SIZE tx_size, BitDepthInfo bd_info, + const int16_t *src_diff, int src_stride, + tran_low_t *coeff) { + if (use_hadamard) { + switch (tx_size) { + case TX_4X4: aom_hadamard_4x4(src_diff, src_stride, coeff); 
break; + case TX_8X8: aom_hadamard_8x8(src_diff, src_stride, coeff); break; + case TX_16X16: aom_hadamard_16x16(src_diff, src_stride, coeff); break; + case TX_32X32: aom_hadamard_32x32(src_diff, src_stride, coeff); break; + default: assert(0); + } + } else { + TxfmParam txfm_param; + txfm_param.tx_type = DCT_DCT; + txfm_param.tx_size = tx_size; + txfm_param.lossless = 0; + txfm_param.bd = bd_info.bit_depth; + txfm_param.is_hbd = bd_info.use_highbitdepth_buf; + txfm_param.tx_set_type = EXT_TX_SET_ALL16; + av1_fwd_txfm(src_diff, coeff, src_stride, &txfm_param); + } +} diff --git a/third_party/libaom/source/libaom/av1/encoder/hybrid_fwd_txfm.h b/third_party/libaom/source/libaom/av1/encoder/hybrid_fwd_txfm.h index daabc7119a..30f8a2258b 100644 --- a/third_party/libaom/source/libaom/av1/encoder/hybrid_fwd_txfm.h +++ b/third_party/libaom/source/libaom/av1/encoder/hybrid_fwd_txfm.h @@ -24,6 +24,15 @@ void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param); +/*!\brief Apply Hadamard or DCT transform + * + * \callergraph + * DCT and Hadamard transforms are commonly used for quick RD score estimation. + * The coeff buffer's size should be equal to the number of pixels + * corresponding to tx_size. 
+ */ +void av1_quick_txfm(int use_hadamard, TX_SIZE tx_size, BitDepthInfo bd_info, + const int16_t *src_diff, int src_stride, tran_low_t *coeff); #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/libaom/source/libaom/av1/encoder/interp_search.c b/third_party/libaom/source/libaom/av1/encoder/interp_search.c index 0066c35434..dd77f6a1c0 100644 --- a/third_party/libaom/source/libaom/av1/encoder/interp_search.c +++ b/third_party/libaom/source/libaom/av1/encoder/interp_search.c @@ -178,7 +178,7 @@ static INLINE int64_t interpolation_filter_rd( mbmi->interp_filters = filter_sets[filter_idx]; const int tmp_rs = get_switchable_rate(x, mbmi->interp_filters, switchable_ctx, - cm->seq_params.enable_dual_filter); + cm->seq_params->enable_dual_filter); int64_t min_rd = RDCOST(x->rdmult, tmp_rs, 0); if (min_rd > *rd) { @@ -449,14 +449,23 @@ static INLINE void find_best_non_dual_interp_filter( interp_search_flags->interp_filter_search_mask; if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) { - const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); const int ctx0 = av1_get_pred_context_switchable_interp(xd, 0); const int ctx1 = av1_get_pred_context_switchable_interp(xd, 1); - const int *switchable_interp_p0 = - cpi->frame_probs.switchable_interp_probs[update_type][ctx0]; - const int *switchable_interp_p1 = - cpi->frame_probs.switchable_interp_probs[update_type][ctx1]; - + int *switchable_interp_p0; + int *switchable_interp_p1; +#if CONFIG_FRAME_PARALLEL_ENCODE + switchable_interp_p0 = (int *)cpi->ppi->temp_frame_probs + .switchable_interp_probs[update_type][ctx0]; + switchable_interp_p1 = (int *)cpi->ppi->temp_frame_probs + .switchable_interp_probs[update_type][ctx1]; +#else + switchable_interp_p0 = + (int *)cpi->frame_probs.switchable_interp_probs[update_type][ctx0]; + switchable_interp_p1 = + (int 
*)cpi->frame_probs.switchable_interp_probs[update_type][ctx1]; +#endif static const int thr[7] = { 0, 8, 8, 8, 8, 0, 8 }; const int thresh = thr[update_type]; for (i = 0; i < SWITCHABLE_FILTERS; i++) { @@ -683,7 +692,7 @@ int64_t av1_interpolation_filter_search( switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1); *switchable_rate = get_switchable_rate(x, mbmi->interp_filters, switchable_ctx, - cm->seq_params.enable_dual_filter); + cm->seq_params->enable_dual_filter); // Do MC evaluation for default filter_type. // Luma MC @@ -747,7 +756,7 @@ int64_t av1_interpolation_filter_search( restore_dst_buf(xd, *tmp_dst, num_planes); const BUFFER_SET *dst_bufs[2] = { tmp_dst, orig_dst }; // Evaluate dual interp filters - if (cm->seq_params.enable_dual_filter) { + if (cm->seq_params->enable_dual_filter) { if (cpi->sf.interp_sf.use_fast_interpolation_filter_search) { fast_dual_interp_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, &rd_stats_luma, &rd_stats, switchable_rate, diff --git a/third_party/libaom/source/libaom/av1/encoder/interp_search.h b/third_party/libaom/source/libaom/av1/encoder/interp_search.h index 1ee26d11ba..902b69960a 100644 --- a/third_party/libaom/source/libaom/av1/encoder/interp_search.h +++ b/third_party/libaom/source/libaom/av1/encoder/interp_search.h @@ -37,7 +37,7 @@ typedef struct { /*!\brief Miscellaneous arguments for inter mode search. */ -typedef struct { +typedef struct HandleInterModeArgs { /*! * Buffer for the above predictor in OBMC */ @@ -139,6 +139,16 @@ typedef struct { * Estimated cmp mode. */ int cmp_mode[MODE_CTX_REF_FRAMES]; + /*! + * The best sse during single new_mv search. Note that the sse here comes from + * single_motion_search, and not from interpolation_filter_search. This has + * two implications: + * 1. The mv used to calculate the sse here does not have to be the best sse + * found in handle_inter_mode. + * 2. 
Even if the mvs agree, the sse here can differ from the sse in \ref + * MACROBLOCK::pred_sse due to different interpolation filter used. + */ + unsigned int best_single_sse_in_refs[REF_FRAMES]; } HandleInterModeArgs; /*!\cond */ diff --git a/third_party/libaom/source/libaom/av1/encoder/intra_mode_search.c b/third_party/libaom/source/libaom/av1/encoder/intra_mode_search.c index 9cb0f4a118..50e53fdde1 100644 --- a/third_party/libaom/source/libaom/av1/encoder/intra_mode_search.c +++ b/third_party/libaom/source/libaom/av1/encoder/intra_mode_search.c @@ -32,6 +32,31 @@ static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = { UV_D113_PRED, UV_D45_PRED, }; +// The bitmask corresponds to the filter intra modes as defined in enums.h +// FILTER_INTRA_MODE enumeration type. Setting a bit to 0 in the mask means to +// disable the evaluation of corresponding filter intra mode. The table +// av1_derived_filter_intra_mode_used_flag is used when speed feature +// prune_filter_intra_level is 1. The evaluated filter intra modes are union +// of the following: +// 1) FILTER_DC_PRED +// 2) mode that corresponds to best mode so far of DC_PRED, V_PRED, H_PRED, +// D157_PRED and PAETH_PRED. (Eg: FILTER_V_PRED if best mode so far is V_PRED). +static const uint8_t av1_derived_filter_intra_mode_used_flag[INTRA_MODES] = { + 0x01, // DC_PRED: 0000 0001 + 0x03, // V_PRED: 0000 0011 + 0x05, // H_PRED: 0000 0101 + 0x01, // D45_PRED: 0000 0001 + 0x01, // D135_PRED: 0000 0001 + 0x01, // D113_PRED: 0000 0001 + 0x09, // D157_PRED: 0000 1001 + 0x01, // D203_PRED: 0000 0001 + 0x01, // D67_PRED: 0000 0001 + 0x01, // SMOOTH_PRED: 0000 0001 + 0x01, // SMOOTH_V_PRED: 0000 0001 + 0x01, // SMOOTH_H_PRED: 0000 0001 + 0x11 // PAETH_PRED: 0001 0001 +}; + // The bitmask corresponds to the chroma intra modes as defined in enums.h // UV_PREDICTION_MODE enumeration type. Setting a bit to 0 in the mask means to // disable the evaluation of corresponding chroma intra mode. 
The table @@ -60,59 +85,6 @@ static const uint16_t av1_derived_chroma_intra_mode_used_flag[INTRA_MODES] = { }; /*!\endcond */ -/*!\brief Calculate the rdcost of a given luma intra angle - * - * \ingroup intra_mode_search - * \callergraph - * This function runs rd calculation for a given luma intra prediction angle. - * This is used to select the best angle delta. - * - * \return Returns the rdcost of the angle and updates the mbmi if the - * new rdcost is better. - */ -static int64_t calc_rd_given_intra_angle( - const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mode_cost, - int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate, - RD_STATS *rd_stats, int *best_angle_delta, TX_SIZE *best_tx_size, - int64_t *best_rd, int64_t *best_model_rd, uint8_t *best_tx_type_map, - uint8_t *best_blk_skip, int skip_model_rd) { - RD_STATS tokenonly_rd_stats; - int64_t this_rd; - MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = xd->mi[0]; - const int n4 = bsize_to_num_blk(bsize); - assert(!is_inter_block(mbmi)); - mbmi->angle_delta[PLANE_TYPE_Y] = angle_delta; - if (!skip_model_rd) { - if (model_intra_yrd_and_prune(cpi, x, bsize, best_model_rd)) { - return INT64_MAX; - } - } - av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize, - best_rd_in); - if (tokenonly_rd_stats.rate == INT_MAX) return INT64_MAX; - - int this_rate = - mode_cost + tokenonly_rd_stats.rate + - x->mode_costs - .angle_delta_cost[mbmi->mode - V_PRED][max_angle_delta + angle_delta]; - this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); - - if (this_rd < *best_rd) { - memcpy(best_blk_skip, x->txfm_search_info.blk_skip, - sizeof(best_blk_skip[0]) * n4); - av1_copy_array(best_tx_type_map, xd->tx_type_map, n4); - *best_rd = this_rd; - *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_Y]; - *best_tx_size = mbmi->tx_size; - *rate = this_rate; - rd_stats->rate = tokenonly_rd_stats.rate; - rd_stats->dist = tokenonly_rd_stats.dist; - rd_stats->skip_txfm = 
tokenonly_rd_stats.skip_txfm; - } - return this_rd; -} - /*!\brief Search for the best filter_intra mode when coding intra frame. * * \ingroup intra_mode_search @@ -125,8 +97,12 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, BLOCK_SIZE bsize, int mode_cost, + PREDICTION_MODE best_mode_so_far, int64_t *best_rd, int64_t *best_model_rd, PICK_MODE_CONTEXT *ctx) { + // Skip the evaluation of filter intra modes. + if (cpi->sf.intra_sf.prune_filter_intra_level == 2) return 0; + MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; int filter_intra_selected_flag = 0; @@ -134,17 +110,33 @@ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x, TX_SIZE best_tx_size = TX_8X8; FILTER_INTRA_MODE_INFO filter_intra_mode_info; uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; - (void)ctx; av1_zero(filter_intra_mode_info); mbmi->filter_intra_mode_info.use_filter_intra = 1; mbmi->mode = DC_PRED; mbmi->palette_mode_info.palette_size[0] = 0; + // Skip the evaluation of filter-intra if cached MB_MODE_INFO does not have + // filter-intra as winner. + if (x->use_mb_mode_cache && + !x->mb_mode_cache->filter_intra_mode_info.use_filter_intra) + return 0; + for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) { int64_t this_rd; RD_STATS tokenonly_rd_stats; mbmi->filter_intra_mode_info.filter_intra_mode = mode; + if ((cpi->sf.intra_sf.prune_filter_intra_level == 1) && + !(av1_derived_filter_intra_mode_used_flag[best_mode_so_far] & + (1 << mode))) + continue; + + // Skip the evaluation of modes that do not match with the winner mode in + // x->mb_mode_cache. 
+ if (x->use_mb_mode_cache && + mode != x->mb_mode_cache->filter_intra_mode_info.filter_intra_mode) + continue; + if (model_intra_yrd_and_prune(cpi, x, bsize, best_model_rd)) { continue; } @@ -248,6 +240,42 @@ void av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, } } +void set_y_mode_and_delta_angle(const int mode_idx, MB_MODE_INFO *const mbmi) { + if (mode_idx < INTRA_MODE_END) { + mbmi->mode = intra_rd_search_mode_order[mode_idx]; + mbmi->angle_delta[PLANE_TYPE_Y] = 0; + } else { + mbmi->mode = (mode_idx - INTRA_MODE_END) / (MAX_ANGLE_DELTA * 2) + V_PRED; + int angle_delta = (mode_idx - INTRA_MODE_END) % (MAX_ANGLE_DELTA * 2); + mbmi->angle_delta[PLANE_TYPE_Y] = + (angle_delta < 3 ? (angle_delta - 3) : (angle_delta - 2)); + } +} + +int prune_intra_y_mode(int64_t this_model_rd, int64_t *best_model_rd, + int64_t top_intra_model_rd[], int model_cnt_allowed) { + const double thresh_best = 1.50; + const double thresh_top = 1.00; + for (int i = 0; i < model_cnt_allowed; i++) { + if (this_model_rd < top_intra_model_rd[i]) { + for (int j = model_cnt_allowed - 1; j > i; j--) { + top_intra_model_rd[j] = top_intra_model_rd[j - 1]; + } + top_intra_model_rd[i] = this_model_rd; + break; + } + } + if (top_intra_model_rd[model_cnt_allowed - 1] != INT64_MAX && + this_model_rd > thresh_top * top_intra_model_rd[model_cnt_allowed - 1]) + return 1; + + if (this_model_rd != INT64_MAX && + this_model_rd > thresh_best * (*best_model_rd)) + return 1; + if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd; + return 0; +} + // Run RD calculation with given chroma intra prediction angle., and return // the RD cost. Update the best mode info. if the RD cost is the best so far. static int64_t pick_intra_angle_routine_sbuv( @@ -342,125 +370,199 @@ static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, #define PLANE_SIGN_TO_JOINT_SIGN(plane, a, b) \ (plane == CFL_PRED_U ? 
a * CFL_SIGNS + b - 1 : b * CFL_SIGNS + a - 1) -static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi, - TX_SIZE tx_size, int64_t best_rd) { + +static void cfl_idx_to_sign_and_alpha(int cfl_idx, CFL_SIGN_TYPE *cfl_sign, + int *cfl_alpha) { + int cfl_linear_idx = cfl_idx - CFL_INDEX_ZERO; + if (cfl_linear_idx == 0) { + *cfl_sign = CFL_SIGN_ZERO; + *cfl_alpha = 0; + } else { + *cfl_sign = cfl_linear_idx > 0 ? CFL_SIGN_POS : CFL_SIGN_NEG; + *cfl_alpha = abs(cfl_linear_idx) - 1; + } +} + +static int64_t cfl_compute_rd(const AV1_COMP *const cpi, MACROBLOCK *x, + int plane, TX_SIZE tx_size, + BLOCK_SIZE plane_bsize, int cfl_idx, + int fast_mode, RD_STATS *rd_stats) { + assert(IMPLIES(fast_mode, rd_stats == NULL)); + const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; - const MACROBLOCKD_PLANE *pd = &xd->plane[AOM_PLANE_U]; - const ModeCosts *mode_costs = &x->mode_costs; - const BLOCK_SIZE plane_bsize = - get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y); - - assert(is_cfl_allowed(xd) && cpi->oxcf.intra_mode_cfg.enable_cfl_intra); - assert(plane_bsize < BLOCK_SIZES_ALL); - if (!xd->lossless[mbmi->segment_id]) { - assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]); - assert(block_size_high[plane_bsize] == tx_size_high[tx_size]); + int cfl_plane = get_cfl_pred_type(plane); + CFL_SIGN_TYPE cfl_sign; + int cfl_alpha; + cfl_idx_to_sign_and_alpha(cfl_idx, &cfl_sign, &cfl_alpha); + // We conly build CFL for a given plane, the other plane's sign is dummy + int dummy_sign = CFL_SIGN_NEG; + const int8_t orig_cfl_alpha_signs = mbmi->cfl_alpha_signs; + const uint8_t orig_cfl_alpha_idx = mbmi->cfl_alpha_idx; + mbmi->cfl_alpha_signs = + PLANE_SIGN_TO_JOINT_SIGN(cfl_plane, cfl_sign, dummy_sign); + mbmi->cfl_alpha_idx = (cfl_alpha << CFL_ALPHABET_SIZE_LOG2) + cfl_alpha; + int64_t cfl_cost; + if (fast_mode) { + cfl_cost = + intra_model_rd(cm, x, plane, plane_bsize, 
tx_size, /*use_hadamard=*/0); + } else { + av1_init_rd_stats(rd_stats); + av1_txfm_rd_in_plane(x, cpi, rd_stats, INT64_MAX, 0, plane, plane_bsize, + tx_size, FTXS_NONE, 0); + av1_rd_cost_update(x->rdmult, rd_stats); + cfl_cost = rd_stats->rdcost; } + mbmi->cfl_alpha_signs = orig_cfl_alpha_signs; + mbmi->cfl_alpha_idx = orig_cfl_alpha_idx; + return cfl_cost; +} + +static void cfl_pick_plane_parameter(const AV1_COMP *const cpi, MACROBLOCK *x, + int plane, TX_SIZE tx_size, + int cfl_search_range, + RD_STATS cfl_rd_arr[CFL_MAGS_SIZE]) { + assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE); + MACROBLOCKD *const xd = &x->e_mbd; xd->cfl.use_dc_pred_cache = 1; - const int64_t mode_rd = RDCOST( - x->rdmult, - mode_costs->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED], 0); - int64_t best_rd_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES]; - int best_c[CFL_JOINT_SIGNS][CFL_PRED_PLANES]; -#if CONFIG_DEBUG - int best_rate_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES]; -#endif // CONFIG_DEBUG - - const int skip_trellis = 0; - for (int plane = 0; plane < CFL_PRED_PLANES; plane++) { - RD_STATS rd_stats; - av1_init_rd_stats(&rd_stats); - for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) { - best_rd_uv[joint_sign][plane] = INT64_MAX; - best_c[joint_sign][plane] = 0; - } - // Collect RD stats for an alpha value of zero in this plane. - // Skip i == CFL_SIGN_ZERO as (0, 0) is invalid. 
- for (int i = CFL_SIGN_NEG; i < CFL_SIGNS; i++) { - const int8_t joint_sign = - PLANE_SIGN_TO_JOINT_SIGN(plane, CFL_SIGN_ZERO, i); - if (i == CFL_SIGN_NEG) { - mbmi->cfl_alpha_idx = 0; - mbmi->cfl_alpha_signs = joint_sign; - av1_txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, 0, plane + 1, - plane_bsize, tx_size, FTXS_NONE, skip_trellis); - if (rd_stats.rate == INT_MAX) break; - } - const int alpha_rate = mode_costs->cfl_cost[joint_sign][plane][0]; - best_rd_uv[joint_sign][plane] = - RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist); -#if CONFIG_DEBUG - best_rate_uv[joint_sign][plane] = rd_stats.rate; -#endif // CONFIG_DEBUG - } - } - int8_t best_joint_sign = -1; - - for (int plane = 0; plane < CFL_PRED_PLANES; plane++) { - for (int pn_sign = CFL_SIGN_NEG; pn_sign < CFL_SIGNS; pn_sign++) { - int progress = 0; - for (int c = 0; c < CFL_ALPHABET_SIZE; c++) { - int flag = 0; - RD_STATS rd_stats; - if (c > 2 && progress < c) break; - av1_init_rd_stats(&rd_stats); - for (int i = 0; i < CFL_SIGNS; i++) { - const int8_t joint_sign = PLANE_SIGN_TO_JOINT_SIGN(plane, pn_sign, i); - if (i == 0) { - mbmi->cfl_alpha_idx = (c << CFL_ALPHABET_SIZE_LOG2) + c; - mbmi->cfl_alpha_signs = joint_sign; - av1_txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, 0, plane + 1, - plane_bsize, tx_size, FTXS_NONE, skip_trellis); - if (rd_stats.rate == INT_MAX) break; - } - const int alpha_rate = mode_costs->cfl_cost[joint_sign][plane][c]; - int64_t this_rd = - RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist); - if (this_rd >= best_rd_uv[joint_sign][plane]) continue; - best_rd_uv[joint_sign][plane] = this_rd; - best_c[joint_sign][plane] = c; -#if CONFIG_DEBUG - best_rate_uv[joint_sign][plane] = rd_stats.rate; -#endif // CONFIG_DEBUG - flag = 2; - if (best_rd_uv[joint_sign][!plane] == INT64_MAX) continue; - this_rd += mode_rd + best_rd_uv[joint_sign][!plane]; - if (this_rd >= best_rd) continue; - best_rd = this_rd; - best_joint_sign = joint_sign; + MB_MODE_INFO *const mbmi = 
xd->mi[0]; + assert(mbmi->uv_mode == UV_CFL_PRED); + const MACROBLOCKD_PLANE *pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y); + + const int dir_ls[2] = { 1, -1 }; + + int est_best_cfl_idx = CFL_INDEX_ZERO; + if (cfl_search_range < CFL_MAGS_SIZE) { + int fast_mode = 1; + int start_cfl_idx = CFL_INDEX_ZERO; + int64_t best_cfl_cost = cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, + start_cfl_idx, fast_mode, NULL); + for (int si = 0; si < 2; ++si) { + const int dir = dir_ls[si]; + for (int i = 1; i < CFL_MAGS_SIZE; ++i) { + int cfl_idx = start_cfl_idx + dir * i; + if (cfl_idx < 0 || cfl_idx >= CFL_MAGS_SIZE) break; + int64_t cfl_cost = cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, + cfl_idx, fast_mode, NULL); + if (cfl_cost < best_cfl_cost) { + best_cfl_cost = cfl_cost; + est_best_cfl_idx = cfl_idx; + } else { + break; } - progress += flag; } } } - int best_rate_overhead = INT_MAX; - uint8_t ind = 0; - if (best_joint_sign >= 0) { - const int u = best_c[best_joint_sign][CFL_PRED_U]; - const int v = best_c[best_joint_sign][CFL_PRED_V]; - ind = (u << CFL_ALPHABET_SIZE_LOG2) + v; - best_rate_overhead = mode_costs->cfl_cost[best_joint_sign][CFL_PRED_U][u] + - mode_costs->cfl_cost[best_joint_sign][CFL_PRED_V][v]; -#if CONFIG_DEBUG - xd->cfl.rate = - mode_costs->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED] + - best_rate_overhead + best_rate_uv[best_joint_sign][CFL_PRED_U] + - best_rate_uv[best_joint_sign][CFL_PRED_V]; -#endif // CONFIG_DEBUG - } else { - best_joint_sign = 0; + for (int cfl_idx = 0; cfl_idx < CFL_MAGS_SIZE; ++cfl_idx) { + av1_invalid_rd_stats(&cfl_rd_arr[cfl_idx]); } - mbmi->cfl_alpha_idx = ind; - mbmi->cfl_alpha_signs = best_joint_sign; + int fast_mode = 0; + int start_cfl_idx = est_best_cfl_idx; + cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, start_cfl_idx, fast_mode, + &cfl_rd_arr[start_cfl_idx]); + for (int si = 0; si < 2; ++si) { + const int 
dir = dir_ls[si]; + for (int i = 1; i < cfl_search_range; ++i) { + int cfl_idx = start_cfl_idx + dir * i; + if (cfl_idx < 0 || cfl_idx >= CFL_MAGS_SIZE) break; + cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, cfl_idx, fast_mode, + &cfl_rd_arr[cfl_idx]); + } + } xd->cfl.use_dc_pred_cache = 0; xd->cfl.dc_pred_is_cached[0] = 0; xd->cfl.dc_pred_is_cached[1] = 0; - return best_rate_overhead; +} + +/*!\brief Pick the optimal parameters for Chroma to Luma (CFL) component + * + * \ingroup intra_mode_search + * \callergraph + * + * This function will use DCT_DCT followed by computing SATD (sum of absolute + * transformed differences) to estimate the RD score and find the best possible + * CFL parameter. + * + * Then the function will apply a full RD search near the best possible CFL + * parameter to find the best actual CFL parameter. + * + * Side effect: + * We use ths buffers in x->plane[] and xd->plane[] as throw-away buffers for RD + * search. + * + * \param[in] x Encoder prediction block structure. + * \param[in] cpi Top-level encoder instance structure. + * \param[in] tx_size Transform size. + * \param[in] ref_best_rd Reference best RD. + * \param[in] cfl_search_range The search range of full RD search near the + * estimated best CFL parameter. 
+ * + * \param[out] best_rd_stats RD stats of the best CFL parameter + * \param[out] best_cfl_alpha_idx Best CFL alpha index + * \param[out] best_cfl_alpha_signs Best CFL joint signs + * + */ +static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi, + TX_SIZE tx_size, int64_t ref_best_rd, + int cfl_search_range, RD_STATS *best_rd_stats, + uint8_t *best_cfl_alpha_idx, + int8_t *best_cfl_alpha_signs) { + assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE); + const ModeCosts *mode_costs = &x->mode_costs; + RD_STATS cfl_rd_arr_u[CFL_MAGS_SIZE]; + RD_STATS cfl_rd_arr_v[CFL_MAGS_SIZE]; + + av1_invalid_rd_stats(best_rd_stats); + + cfl_pick_plane_parameter(cpi, x, 1, tx_size, cfl_search_range, cfl_rd_arr_u); + cfl_pick_plane_parameter(cpi, x, 2, tx_size, cfl_search_range, cfl_rd_arr_v); + + for (int ui = 0; ui < CFL_MAGS_SIZE; ++ui) { + if (cfl_rd_arr_u[ui].rate == INT_MAX) continue; + int cfl_alpha_u; + CFL_SIGN_TYPE cfl_sign_u; + cfl_idx_to_sign_and_alpha(ui, &cfl_sign_u, &cfl_alpha_u); + for (int vi = 0; vi < CFL_MAGS_SIZE; ++vi) { + if (cfl_rd_arr_v[vi].rate == INT_MAX) continue; + int cfl_alpha_v; + CFL_SIGN_TYPE cfl_sign_v; + cfl_idx_to_sign_and_alpha(vi, &cfl_sign_v, &cfl_alpha_v); + // cfl_sign_u == CFL_SIGN_ZERO && cfl_sign_v == CFL_SIGN_ZERO is not a + // valid parameter for CFL + if (cfl_sign_u == CFL_SIGN_ZERO && cfl_sign_v == CFL_SIGN_ZERO) continue; + int joint_sign = cfl_sign_u * CFL_SIGNS + cfl_sign_v - 1; + RD_STATS rd_stats = cfl_rd_arr_u[ui]; + av1_merge_rd_stats(&rd_stats, &cfl_rd_arr_v[vi]); + if (rd_stats.rate != INT_MAX) { + rd_stats.rate += + mode_costs->cfl_cost[joint_sign][CFL_PRED_U][cfl_alpha_u]; + rd_stats.rate += + mode_costs->cfl_cost[joint_sign][CFL_PRED_V][cfl_alpha_v]; + } + av1_rd_cost_update(x->rdmult, &rd_stats); + if (rd_stats.rdcost < best_rd_stats->rdcost) { + *best_rd_stats = rd_stats; + *best_cfl_alpha_idx = + (cfl_alpha_u << CFL_ALPHABET_SIZE_LOG2) + cfl_alpha_v; + *best_cfl_alpha_signs = 
joint_sign; + } + } + } + if (best_rd_stats->rdcost >= ref_best_rd) { + av1_invalid_rd_stats(best_rd_stats); + // Set invalid CFL parameters here since the rdcost is not better than + // ref_best_rd. + *best_cfl_alpha_idx = 0; + *best_cfl_alpha_signs = 0; + return 0; + } + return 1; } int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, @@ -532,19 +634,19 @@ int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, mbmi->uv_mode = mode; // Init variables for cfl and angle delta - int cfl_alpha_rate = 0; + const SPEED_FEATURES *sf = &cpi->sf; + mbmi->angle_delta[PLANE_TYPE_UV] = 0; if (mode == UV_CFL_PRED) { if (!is_cfl_allowed(xd) || !intra_mode_cfg->enable_cfl_intra) continue; assert(!is_directional_mode); const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); - cfl_alpha_rate = cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd); - if (cfl_alpha_rate == INT_MAX) continue; - } - mbmi->angle_delta[PLANE_TYPE_UV] = 0; - - if (is_directional_mode && av1_use_angle_delta(mbmi->bsize) && - intra_mode_cfg->enable_angle_delta) { - const SPEED_FEATURES *sf = &cpi->sf; + if (!cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd, + sf->intra_sf.cfl_search_range, &tokenonly_rd_stats, + &mbmi->cfl_alpha_idx, &mbmi->cfl_alpha_signs)) { + continue; + } + } else if (is_directional_mode && av1_use_angle_delta(mbmi->bsize) && + intra_mode_cfg->enable_angle_delta) { if (sf->intra_sf.chroma_intra_pruning_with_hog && !intra_search_state.dir_mode_skip_mask_ready) { static const float thresh[2][4] = { @@ -554,7 +656,7 @@ int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, const int is_chroma = 1; const int is_intra_frame = frame_is_intra_only(cm); prune_intra_mode_with_hog( - x, bsize, + x, bsize, cm->seq_params->sb_size, thresh[is_intra_frame] [sf->intra_sf.chroma_intra_pruning_with_hog - 1], intra_search_state.directional_mode_skip_mask, is_chroma); @@ -577,17 +679,9 @@ int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP 
*const cpi, MACROBLOCK *x, } } const int mode_cost = - mode_costs->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode] + - cfl_alpha_rate; + mode_costs->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode]; this_rate = tokenonly_rd_stats.rate + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost); - if (mode == UV_CFL_PRED) { - assert(is_cfl_allowed(xd) && intra_mode_cfg->enable_cfl_intra); -#if CONFIG_DEBUG - if (!xd->lossless[mbmi->segment_id]) - assert(xd->cfl.rate == tokenonly_rd_stats.rate + mode_cost); -#endif // CONFIG_DEBUG - } this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); if (this_rd < best_rd) { @@ -633,8 +727,7 @@ int av1_search_palette_mode(IntraModeSearchState *intra_search_state, const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; int rate2 = 0; - int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd, - best_model_rd_palette = INT64_MAX; + int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd; int skippable = 0; uint8_t *const best_palette_color_map = x->palette_buffer->best_palette_color_map; @@ -656,11 +749,11 @@ int av1_search_palette_mode(IntraModeSearchState *intra_search_state, RD_STATS rd_stats_y; av1_invalid_rd_stats(&rd_stats_y); - av1_rd_pick_palette_intra_sby( - cpi, x, bsize, intra_mode_cost[DC_PRED], &best_mbmi_palette, - best_palette_color_map, &best_rd_palette, &best_model_rd_palette, - &rd_stats_y.rate, NULL, &rd_stats_y.dist, &rd_stats_y.skip_txfm, NULL, - ctx, best_blk_skip, best_tx_type_map); + av1_rd_pick_palette_intra_sby(cpi, x, bsize, intra_mode_cost[DC_PRED], + &best_mbmi_palette, best_palette_color_map, + &best_rd_palette, &rd_stats_y.rate, NULL, + &rd_stats_y.dist, &rd_stats_y.skip_txfm, NULL, + ctx, best_blk_skip, best_tx_type_map); if (rd_stats_y.rate == INT_MAX || pmi->palette_size[0] == 0) { this_rd_cost->rdcost = INT64_MAX; return skippable; @@ -766,81 +859,6 @@ static AOM_INLINE int intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, return 0; } 
-/*!\brief Search for the best angle delta for luma prediction - * - * \ingroup intra_mode_search - * \callergraph - * Given a luma directional intra prediction mode, this function will try to - * estimate the best delta_angle. - * - * \return Returns the new rdcost of the best intra angle. - */ -static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x, - int *rate, RD_STATS *rd_stats, - BLOCK_SIZE bsize, int mode_cost, - int64_t best_rd, int64_t *best_model_rd, - int skip_model_rd_for_zero_deg) { - MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = xd->mi[0]; - assert(!is_inter_block(mbmi)); - - int best_angle_delta = 0; - int64_t rd_cost[2 * (MAX_ANGLE_DELTA + 2)]; - TX_SIZE best_tx_size = mbmi->tx_size; - uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; - uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; - - for (int i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX; - - int first_try = 1; - for (int angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { - for (int i = 0; i < 2; ++i) { - const int64_t best_rd_in = - (best_rd == INT64_MAX) ? INT64_MAX - : (best_rd + (best_rd >> (first_try ? 
3 : 5))); - const int64_t this_rd = calc_rd_given_intra_angle( - cpi, x, bsize, mode_cost, best_rd_in, (1 - 2 * i) * angle_delta, - MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size, - &best_rd, best_model_rd, best_tx_type_map, best_blk_skip, - (skip_model_rd_for_zero_deg & !angle_delta)); - rd_cost[2 * angle_delta + i] = this_rd; - if (first_try && this_rd == INT64_MAX) return best_rd; - first_try = 0; - if (angle_delta == 0) { - rd_cost[1] = this_rd; - break; - } - } - } - - assert(best_rd != INT64_MAX); - for (int angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { - for (int i = 0; i < 2; ++i) { - int skip_search = 0; - const int64_t rd_thresh = best_rd + (best_rd >> 5); - if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh && - rd_cost[2 * (angle_delta - 1) + i] > rd_thresh) - skip_search = 1; - if (!skip_search) { - calc_rd_given_intra_angle( - cpi, x, bsize, mode_cost, best_rd, (1 - 2 * i) * angle_delta, - MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size, - &best_rd, best_model_rd, best_tx_type_map, best_blk_skip, 0); - } - } - } - - if (rd_stats->rate != INT_MAX) { - mbmi->tx_size = best_tx_size; - mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta; - const int n4 = bsize_to_num_blk(bsize); - memcpy(x->txfm_search_info.blk_skip, best_blk_skip, - sizeof(best_blk_skip[0]) * n4); - av1_copy_array(xd->tx_type_map, best_tx_type_map, n4); - } - return best_rd; -} - /*!\brief Search for the best filter_intra mode when coding inter frame. * * \ingroup intra_mode_search @@ -909,11 +927,14 @@ static INLINE void handle_filter_intra_mode(const AV1_COMP *cpi, MACROBLOCK *x, } } +// Evaluate a given luma intra-mode in inter frames. 
int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state, const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, unsigned int ref_frame_cost, const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats_y, - int64_t best_rd, int *mode_cost_y, int64_t *rd_y) { + int64_t best_rd, int *mode_cost_y, int64_t *rd_y, + int64_t *best_model_rd, + int64_t top_intra_model_rd[]) { const AV1_COMMON *cm = &cpi->common; const SPEED_FEATURES *const sf = &cpi->sf; MACROBLOCKD *const xd = &x->e_mbd; @@ -928,7 +949,7 @@ int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state, int known_rate = mode_cost; const int intra_cost_penalty = av1_get_intra_cost_penalty( cm->quant_params.base_qindex, cm->quant_params.y_dc_delta_q, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); if (mode != DC_PRED && mode != PAETH_PRED) known_rate += intra_cost_penalty; known_rate += AOMMIN(mode_costs->skip_txfm_cost[skip_ctx][0], @@ -946,32 +967,34 @@ int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state, !intra_search_state->dir_mode_skip_mask_ready) { const float thresh[4] = { -1.2f, 0.0f, 0.0f, 1.2f }; const int is_chroma = 0; - prune_intra_mode_with_hog( - x, bsize, thresh[sf->intra_sf.intra_pruning_with_hog - 1], - intra_search_state->directional_mode_skip_mask, is_chroma); + prune_intra_mode_with_hog(x, bsize, cm->seq_params->sb_size, + thresh[sf->intra_sf.intra_pruning_with_hog - 1], + intra_search_state->directional_mode_skip_mask, + is_chroma); intra_search_state->dir_mode_skip_mask_ready = 1; } if (intra_search_state->directional_mode_skip_mask[mode]) return 0; - av1_init_rd_stats(rd_stats_y); - rd_stats_y->rate = INT_MAX; - int64_t model_rd = INT64_MAX; - int rate_dummy; - rd_pick_intra_angle_sby(cpi, x, &rate_dummy, rd_stats_y, bsize, mode_cost, - best_rd, &model_rd, 0); - - } else { - av1_init_rd_stats(rd_stats_y); - mbmi->angle_delta[PLANE_TYPE_Y] = 0; - av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, best_rd); } + const TX_SIZE tx_size = 
AOMMIN(TX_32X32, max_txsize_lookup[bsize]); + const int64_t this_model_rd = + intra_model_rd(&cpi->common, x, 0, bsize, tx_size, /*use_hadamard=*/1); + if (prune_intra_y_mode(this_model_rd, best_model_rd, top_intra_model_rd, + sf->intra_sf.top_intra_model_count_allowed)) + return 0; + av1_init_rd_stats(rd_stats_y); + av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, best_rd); // Pick filter intra modes. if (mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) { int try_filter_intra = 1; int64_t best_rd_so_far = INT64_MAX; if (rd_stats_y->rate != INT_MAX) { - const int tmp_rate = rd_stats_y->rate + - mode_costs->filter_intra_cost[bsize][0] + mode_cost; + // best_rd_so_far is the rdcost of DC_PRED without using filter_intra. + // Later, in filter intra search, best_rd_so_far is used for comparison. + mbmi->filter_intra_mode_info.use_filter_intra = 0; + const int tmp_rate = + rd_stats_y->rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost); best_rd_so_far = RDCOST(x->rdmult, tmp_rate, rd_stats_y->dist); try_filter_intra = (best_rd_so_far / 2) <= best_rd; } @@ -1095,7 +1118,8 @@ int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, const float thresh[4] = { -1.2f, -1.2f, -0.6f, 0.4f }; const int is_chroma = 0; prune_intra_mode_with_hog( - x, bsize, thresh[cpi->sf.intra_sf.intra_pruning_with_hog - 1], + x, bsize, cpi->common.seq_params->sb_size, + thresh[cpi->sf.intra_sf.intra_pruning_with_hog - 1], directional_mode_skip_mask, is_chroma); } mbmi->filter_intra_mode_info.use_filter_intra = 0; @@ -1105,16 +1129,21 @@ int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, set_mode_eval_params(cpi, x, MODE_EVAL); MB_MODE_INFO best_mbmi = *mbmi; - av1_zero(x->winner_mode_stats); + av1_zero_array(x->winner_mode_stats, MAX_WINNER_MODE_COUNT_INTRA); x->winner_mode_count = 0; // Searches the intra-modes except for intrabc, palette, and filter_intra. 
- for (int mode_idx = INTRA_MODE_START; mode_idx < INTRA_MODE_END; ++mode_idx) { + int64_t top_intra_model_rd[TOP_INTRA_MODEL_COUNT]; + for (int i = 0; i < TOP_INTRA_MODEL_COUNT; i++) { + top_intra_model_rd[i] = INT64_MAX; + } + for (int mode_idx = INTRA_MODE_START; mode_idx < LUMA_MODE_COUNT; + ++mode_idx) { + set_y_mode_and_delta_angle(mode_idx, mbmi); RD_STATS this_rd_stats; int this_rate, this_rate_tokenonly, s; int is_diagonal_mode; int64_t this_distortion, this_rd; - mbmi->mode = intra_rd_search_mode_order[mode_idx]; is_diagonal_mode = av1_is_diagonal_mode(mbmi->mode); if (is_diagonal_mode && !cpi->oxcf.intra_mode_cfg.enable_diagonal_intra) @@ -1132,36 +1161,43 @@ int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, continue; // The functionality of filter intra modes and smooth prediction - // overlap. Retain the smooth prediction if filter intra modes are - // disabled. + // overlap. Hence smooth prediction is pruned only if all the + // filter intra modes are enabled. if (cpi->sf.intra_sf.disable_smooth_intra && - !cpi->sf.intra_sf.disable_filter_intra && mbmi->mode == SMOOTH_PRED) + cpi->sf.intra_sf.prune_filter_intra_level == 0 && + mbmi->mode == SMOOTH_PRED) continue; if (!cpi->oxcf.intra_mode_cfg.enable_paeth_intra && mbmi->mode == PAETH_PRED) continue; - mbmi->angle_delta[PLANE_TYPE_Y] = 0; + + // Skip the evaluation of modes that do not match with the winner mode in + // x->mb_mode_cache. + if (x->use_mb_mode_cache && mbmi->mode != x->mb_mode_cache->mode) continue; is_directional_mode = av1_is_directional_mode(mbmi->mode); if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue; - if (is_directional_mode && av1_use_angle_delta(bsize) && - cpi->oxcf.intra_mode_cfg.enable_angle_delta) { - // Searches through the best angle_delta if this option is available. 
- this_rd_stats.rate = INT_MAX; - rd_pick_intra_angle_sby(cpi, x, &this_rate, &this_rd_stats, bsize, - bmode_costs[mbmi->mode], best_rd, &best_model_rd, - 1); - } else { - if (model_intra_yrd_and_prune(cpi, x, bsize, &best_model_rd)) { - continue; - } + if (is_directional_mode && av1_use_angle_delta(bsize) == 0 && + mbmi->angle_delta[PLANE_TYPE_Y] != 0) + continue; - // Builds the actual prediction. The prediction from - // model_intra_yrd_and_prune was just an estimation that did not take into - // account the effect of txfm pipeline, so we need to redo it for real - // here. - av1_pick_uniform_tx_size_type_yrd(cpi, x, &this_rd_stats, bsize, best_rd); - } + // Use intra_y_mode_mask speed feature to skip intra mode evaluation. + if (!(cpi->sf.intra_sf.intra_y_mode_mask[max_txsize_lookup[bsize]] & + (1 << mbmi->mode))) + continue; + + const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]); + const int64_t this_model_rd = + intra_model_rd(&cpi->common, x, 0, bsize, tx_size, /*use_hadamard=*/1); + if (prune_intra_y_mode(this_model_rd, &best_model_rd, top_intra_model_rd, + cpi->sf.intra_sf.top_intra_model_count_allowed)) + continue; + + // Builds the actual prediction. The prediction from + // model_intra_yrd_and_prune was just an estimation that did not take into + // account the effect of txfm pipeline, so we need to redo it for real + // here. 
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &this_rd_stats, bsize, best_rd); this_rate_tokenonly = this_rd_stats.rate; this_distortion = this_rd_stats.dist; s = this_rd_stats.skip_txfm; @@ -1204,16 +1240,16 @@ int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, if (try_palette) { av1_rd_pick_palette_intra_sby( cpi, x, bsize, bmode_costs[DC_PRED], &best_mbmi, best_palette_color_map, - &best_rd, &best_model_rd, rate, rate_tokenonly, distortion, skippable, - &beat_best_rd, ctx, ctx->blk_skip, ctx->tx_type_map); + &best_rd, rate, rate_tokenonly, distortion, skippable, &beat_best_rd, + ctx, ctx->blk_skip, ctx->tx_type_map); } // Searches filter_intra - if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize) && - !cpi->sf.intra_sf.disable_filter_intra) { + if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) { if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion, skippable, bsize, bmode_costs[DC_PRED], - &best_rd, &best_model_rd, ctx)) { + best_mbmi.mode, &best_rd, &best_model_rd, + ctx)) { best_mbmi = *mbmi; } } diff --git a/third_party/libaom/source/libaom/av1/encoder/intra_mode_search.h b/third_party/libaom/source/libaom/av1/encoder/intra_mode_search.h index cc2a87b098..5a52440909 100644 --- a/third_party/libaom/source/libaom/av1/encoder/intra_mode_search.h +++ b/third_party/libaom/source/libaom/av1/encoder/intra_mode_search.h @@ -95,6 +95,9 @@ typedef struct IntraModeSearchState { * \param[out] mode_cost_y The cost needed to signal the current * intra mode. * \param[out] rd_y The rdcost of the chosen mode. + * \param[in] best_model_rd Best model RD seen for this block so far + * \param[in] top_intra_model_rd Top intra model RD seen for this + * block so far. * * \return Returns 1 if a valid intra mode is found, 0 otherwise. 
* The corresponding values in x->e_mbd.mi[0], rd_stats_y, mode_cost_y, and @@ -106,7 +109,9 @@ int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state, const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, unsigned int ref_frame_cost, const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats_y, - int64_t best_rd, int *mode_cost_y, int64_t *rd_y); + int64_t best_rd, int *mode_cost_y, int64_t *rd_y, + int64_t *best_model_rd, + int64_t top_intra_model_rd[]); /*!\brief Search through all chroma intra-modes for inter frames. * @@ -262,6 +267,29 @@ static AOM_INLINE void init_intra_mode_search_state( intra_search_state->rate_uv_intra = INT_MAX; } +/*! \brief set the luma intra mode and delta angles for a given mode index. + * The total number of luma intra mode is LUMA_MODE_COUNT = 61. + * The first 13 modes are from DC_PRED to PAETH_PRED, followed by directional + * modes. Each of the main 8 directional modes have 6 = MAX_ANGLE_DELTA * 2 + * delta angles. + * \param[in] mode_idx mode index in intra mode decision + * process. + * \param[in] mbmi Pointer to structure holding + * the mode info for the current macroblock. + */ +void set_y_mode_and_delta_angle(const int mode_idx, MB_MODE_INFO *const mbmi); + +/*! \brief prune luma intra mode based on the model rd. + * \param[in] this_model_rd model rd for current mode. + * \param[in] best_model_rd Best model RD seen for this block so + * far. + * \param[in] top_intra_model_rd Top intra model RD seen for this + * block so far. + * \param[in] model_cnt_allowed The number of top intra model RD allowed. 
+ */ +int prune_intra_y_mode(int64_t this_model_rd, int64_t *best_model_rd, + int64_t top_intra_model_rd[], int model_cnt_allowed); + #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/libaom/source/libaom/av1/encoder/intra_mode_search_utils.h b/third_party/libaom/source/libaom/av1/encoder/intra_mode_search_utils.h index 532482896a..0bf77ac9f5 100644 --- a/third_party/libaom/source/libaom/av1/encoder/intra_mode_search_utils.h +++ b/third_party/libaom/source/libaom/av1/encoder/intra_mode_search_utils.h @@ -22,8 +22,10 @@ #include "av1/common/reconintra.h" #include "av1/encoder/encoder.h" +#include "av1/encoder/encodeframe.h" #include "av1/encoder/model_rd.h" #include "av1/encoder/palette.h" +#include "av1/encoder/hybrid_fwd_txfm.h" #ifdef __cplusplus extern "C" { @@ -134,8 +136,13 @@ static AOM_INLINE int get_hist_bin_idx(int dx, int dy) { } #undef FIX_PREC_BITS -static AOM_INLINE void generate_hog(const uint8_t *src, int stride, int rows, - int cols, float *hist) { +// Normalizes the hog data. +static AOM_INLINE void normalize_hog(float total, float *hist) { + for (int i = 0; i < BINS; ++i) hist[i] /= total; +} + +static AOM_INLINE void lowbd_generate_hog(const uint8_t *src, int stride, + int rows, int cols, float *hist) { float total = 0.1f; src += stride; for (int r = 1; r < rows - 1; ++r) { @@ -144,7 +151,7 @@ static AOM_INLINE void generate_hog(const uint8_t *src, int stride, int rows, const uint8_t *below = &src[c + stride]; const uint8_t *left = &src[c - 1]; const uint8_t *right = &src[c + 1]; - // Calculate gradient using Sobel fitlers. + // Calculate gradient using Sobel filters. 
const int dx = (right[-stride] + 2 * right[0] + right[stride]) - (left[-stride] + 2 * left[0] + left[stride]); const int dy = (below[-1] + 2 * below[0] + below[1]) - @@ -165,13 +172,49 @@ static AOM_INLINE void generate_hog(const uint8_t *src, int stride, int rows, src += stride; } - for (int i = 0; i < BINS; ++i) hist[i] /= total; + normalize_hog(total, hist); } -static AOM_INLINE void generate_hog_hbd(const uint8_t *src8, int stride, - int rows, int cols, float *hist) { +// Computes and stores pixel level gradient information of a given superblock +// for LBD encode. +static AOM_INLINE void lowbd_compute_gradient_info_sb(MACROBLOCK *const x, + BLOCK_SIZE sb_size, + PLANE_TYPE plane) { + PixelLevelGradientInfo *const grad_info_sb = + x->pixel_gradient_info + plane * MAX_SB_SQUARE; + const uint8_t *src = x->plane[plane].src.buf; + const int stride = x->plane[plane].src.stride; + const int ss_x = x->e_mbd.plane[plane].subsampling_x; + const int ss_y = x->e_mbd.plane[plane].subsampling_y; + const int sb_height = block_size_high[sb_size] >> ss_y; + const int sb_width = block_size_wide[sb_size] >> ss_x; + src += stride; + for (int r = 1; r < sb_height - 1; ++r) { + for (int c = 1; c < sb_width - 1; ++c) { + const uint8_t *above = &src[c - stride]; + const uint8_t *below = &src[c + stride]; + const uint8_t *left = &src[c - 1]; + const uint8_t *right = &src[c + 1]; + // Calculate gradient using Sobel filters. + const int dx = (right[-stride] + 2 * right[0] + right[stride]) - + (left[-stride] + 2 * left[0] + left[stride]); + const int dy = (below[-1] + 2 * below[0] + below[1]) - + (above[-1] + 2 * above[0] + above[1]); + grad_info_sb[r * sb_width + c].is_dx_zero = (dx == 0); + grad_info_sb[r * sb_width + c].abs_dx_abs_dy_sum = + (uint16_t)(abs(dx) + abs(dy)); + grad_info_sb[r * sb_width + c].hist_bin_idx = + (dx != 0) ? 
get_hist_bin_idx(dx, dy) : -1; + } + src += stride; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static AOM_INLINE void highbd_generate_hog(const uint8_t *src8, int stride, + int rows, int cols, float *hist) { float total = 0.1f; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); src += stride; for (int r = 1; r < rows - 1; ++r) { for (int c = 1; c < cols - 1; ++c) { @@ -179,7 +222,7 @@ static AOM_INLINE void generate_hog_hbd(const uint8_t *src8, int stride, const uint16_t *below = &src[c + stride]; const uint16_t *left = &src[c - 1]; const uint16_t *right = &src[c + 1]; - // Calculate gradient using Sobel fitlers. + // Calculate gradient using Sobel filters. const int dx = (right[-stride] + 2 * right[0] + right[stride]) - (left[-stride] + 2 * left[0] + left[stride]); const int dy = (below[-1] + 2 * below[0] + below[1]) - @@ -200,11 +243,151 @@ static AOM_INLINE void generate_hog_hbd(const uint8_t *src8, int stride, src += stride; } - for (int i = 0; i < BINS; ++i) hist[i] /= total; + normalize_hog(total, hist); +} + +// Computes and stores pixel level gradient information of a given superblock +// for HBD encode. 
+static AOM_INLINE void highbd_compute_gradient_info_sb(MACROBLOCK *const x, + BLOCK_SIZE sb_size, + PLANE_TYPE plane) { + PixelLevelGradientInfo *const grad_info_sb = + x->pixel_gradient_info + plane * MAX_SB_SQUARE; + const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[plane].src.buf); + const int stride = x->plane[plane].src.stride; + const int ss_x = x->e_mbd.plane[plane].subsampling_x; + const int ss_y = x->e_mbd.plane[plane].subsampling_y; + const int sb_height = block_size_high[sb_size] >> ss_y; + const int sb_width = block_size_wide[sb_size] >> ss_x; + src += stride; + for (int r = 1; r < sb_height - 1; ++r) { + for (int c = 1; c < sb_width - 1; ++c) { + const uint16_t *above = &src[c - stride]; + const uint16_t *below = &src[c + stride]; + const uint16_t *left = &src[c - 1]; + const uint16_t *right = &src[c + 1]; + // Calculate gradient using Sobel filters. + const int dx = (right[-stride] + 2 * right[0] + right[stride]) - + (left[-stride] + 2 * left[0] + left[stride]); + const int dy = (below[-1] + 2 * below[0] + below[1]) - + (above[-1] + 2 * above[0] + above[1]); + grad_info_sb[r * sb_width + c].is_dx_zero = (dx == 0); + grad_info_sb[r * sb_width + c].abs_dx_abs_dy_sum = + (uint16_t)(abs(dx) + abs(dy)); + grad_info_sb[r * sb_width + c].hist_bin_idx = + (dx != 0) ? 
get_hist_bin_idx(dx, dy) : -1; + } + src += stride; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static AOM_INLINE void generate_hog(const uint8_t *src8, int stride, int rows, + int cols, float *hist, int highbd) { +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + highbd_generate_hog(src8, stride, rows, cols, hist); + return; + } +#else + (void)highbd; +#endif // CONFIG_AV1_HIGHBITDEPTH + lowbd_generate_hog(src8, stride, rows, cols, hist); +} + +static AOM_INLINE void compute_gradient_info_sb(MACROBLOCK *const x, + BLOCK_SIZE sb_size, + PLANE_TYPE plane) { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(&x->e_mbd)) { + highbd_compute_gradient_info_sb(x, sb_size, plane); + return; + } +#endif // CONFIG_AV1_HIGHBITDEPTH + lowbd_compute_gradient_info_sb(x, sb_size, plane); +} + +// Function to generate pixel level gradient information for a given superblock. +// Sets the flags 'is_sb_gradient_cached' for the specific plane-type if +// gradient info is generated for the same. +static AOM_INLINE void produce_gradients_for_sb(AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE sb_size, int mi_row, + int mi_col) { + const SPEED_FEATURES *sf = &cpi->sf; + // Initialise flags related to hog data caching. + x->is_sb_gradient_cached[PLANE_TYPE_Y] = false; + x->is_sb_gradient_cached[PLANE_TYPE_UV] = false; + + // SB level caching of gradient data may not help in speedup for the following + // cases: + // (1) Inter frames (due to early intra gating) + // (2) When partition_search_type is not SEARCH_PARTITION + // Hence, gradient data is computed at block level in such cases. 
+ + if (!frame_is_intra_only(&cpi->common) || + sf->part_sf.partition_search_type != SEARCH_PARTITION) + return; + + const int num_planes = av1_num_planes(&cpi->common); + + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size); + + if (sf->intra_sf.intra_pruning_with_hog) { + compute_gradient_info_sb(x, sb_size, PLANE_TYPE_Y); + x->is_sb_gradient_cached[PLANE_TYPE_Y] = true; + } + if (sf->intra_sf.chroma_intra_pruning_with_hog && num_planes > 1) { + compute_gradient_info_sb(x, sb_size, PLANE_TYPE_UV); + x->is_sb_gradient_cached[PLANE_TYPE_UV] = true; + } +} + +// Reuses the pixel level gradient data generated at superblock level for block +// level histogram computation. +static AOM_INLINE void generate_hog_using_gradient_cache(const MACROBLOCK *x, + int rows, int cols, + BLOCK_SIZE sb_size, + PLANE_TYPE plane, + float *hist) { + float total = 0.1f; + const int ss_x = x->e_mbd.plane[plane].subsampling_x; + const int ss_y = x->e_mbd.plane[plane].subsampling_y; + const int sb_width = block_size_wide[sb_size] >> ss_x; + + // Derive the offset from the starting of the superblock in order to locate + // the block level gradient data in the cache. + const int mi_row_in_sb = x->e_mbd.mi_row & (mi_size_high[sb_size] - 1); + const int mi_col_in_sb = x->e_mbd.mi_col & (mi_size_wide[sb_size] - 1); + const int block_offset_in_grad_cache = + sb_width * (mi_row_in_sb << (MI_SIZE_LOG2 - ss_y)) + + (mi_col_in_sb << (MI_SIZE_LOG2 - ss_x)); + const PixelLevelGradientInfo *grad_info_blk = x->pixel_gradient_info + + plane * MAX_SB_SQUARE + + block_offset_in_grad_cache; + + // Retrieve the cached gradient information and generate the histogram. 
+ for (int r = 1; r < rows - 1; ++r) { + for (int c = 1; c < cols - 1; ++c) { + const uint16_t abs_dx_abs_dy_sum = + grad_info_blk[r * sb_width + c].abs_dx_abs_dy_sum; + if (!abs_dx_abs_dy_sum) continue; + total += abs_dx_abs_dy_sum; + const bool is_dx_zero = grad_info_blk[r * sb_width + c].is_dx_zero; + if (is_dx_zero) { + hist[0] += abs_dx_abs_dy_sum >> 1; + hist[BINS - 1] += abs_dx_abs_dy_sum >> 1; + } else { + const int8_t idx = grad_info_blk[r * sb_width + c].hist_bin_idx; + assert(idx >= 0 && idx < BINS); + hist[idx] += abs_dx_abs_dy_sum; + } + } + } + normalize_hog(total, hist); } static INLINE void collect_hog_data(const MACROBLOCK *x, BLOCK_SIZE bsize, - int plane, float *hog) { + BLOCK_SIZE sb_size, int plane, float *hog) { const MACROBLOCKD *xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; const int ss_x = pd->subsampling_x; @@ -217,12 +400,15 @@ static INLINE void collect_hog_data(const MACROBLOCK *x, BLOCK_SIZE bsize, const int cols = ((xd->mb_to_right_edge >= 0) ? bw : (xd->mb_to_right_edge >> 3) + bw) >> ss_x; - const int src_stride = x->plane[plane].src.stride; - const uint8_t *src = x->plane[plane].src.buf; - if (is_cur_buf_hbd(xd)) { - generate_hog_hbd(src, src_stride, rows, cols, hog); + + // If gradient data is already generated at SB level, reuse the cached data. + // Otherwise, compute the data. 
+ if (x->is_sb_gradient_cached[plane]) { + generate_hog_using_gradient_cache(x, rows, cols, sb_size, plane, hog); } else { - generate_hog(src, src_stride, rows, cols, hog); + const uint8_t *src = x->plane[plane].src.buf; + const int src_stride = x->plane[plane].src.stride; + generate_hog(src, src_stride, rows, cols, hog, is_cur_buf_hbd(xd)); } // Scale the hog so the luma and chroma are on the same scale @@ -232,13 +418,13 @@ static INLINE void collect_hog_data(const MACROBLOCK *x, BLOCK_SIZE bsize, } static AOM_INLINE void prune_intra_mode_with_hog( - const MACROBLOCK *x, BLOCK_SIZE bsize, float th, + const MACROBLOCK *x, BLOCK_SIZE bsize, BLOCK_SIZE sb_size, float th, uint8_t *directional_mode_skip_mask, int is_chroma) { aom_clear_system_state(); const int plane = is_chroma ? AOM_PLANE_U : AOM_PLANE_Y; float hist[BINS] = { 0.0f }; - collect_hog_data(x, bsize, plane, hist); + collect_hog_data(x, bsize, sb_size, plane, hist); // Make prediction for each of the mode float scores[DIRECTIONAL_MODES] = { 0.0f }; @@ -305,7 +491,7 @@ static AOM_INLINE int intra_mode_info_cost_y(const AV1_COMP *cpi, const int n_cache = av1_get_palette_cache(xd, 0, color_cache); palette_mode_cost += av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache, - n_cache, cpi->common.seq_params.bit_depth); + n_cache, cpi->common.seq_params->bit_depth); palette_mode_cost += av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP); total_rate += palette_mode_cost; @@ -365,7 +551,7 @@ static AOM_INLINE int intra_mode_info_cost_uv(const AV1_COMP *cpi, uint16_t color_cache[2 * PALETTE_MAX_SIZE]; const int n_cache = av1_get_palette_cache(xd, 1, color_cache); palette_mode_cost += av1_palette_color_cost_uv( - pmi, color_cache, n_cache, cpi->common.seq_params.bit_depth); + pmi, color_cache, n_cache, cpi->common.seq_params->bit_depth); palette_mode_cost += av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP); total_rate += palette_mode_cost; @@ -385,11 +571,11 @@ static AOM_INLINE int 
intra_mode_info_cost_uv(const AV1_COMP *cpi, /*!\cond */ // Makes a quick intra prediction and estimate the rdcost with a model without // going through the whole txfm/quantize/itxfm process. -static int64_t intra_model_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, +static int64_t intra_model_rd(const AV1_COMMON *cm, MACROBLOCK *const x, int plane, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size) { - const AV1_COMMON *cm = &cpi->common; + TX_SIZE tx_size, int use_hadamard) { MACROBLOCKD *const xd = &x->e_mbd; + const BitDepthInfo bd_info = get_bit_depth_info(xd); int row, col; assert(!is_inter_block(xd->mi[0])); const int stepr = tx_size_high_unit[tx_size]; @@ -405,27 +591,16 @@ static int64_t intra_model_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, for (row = 0; row < max_blocks_high; row += stepr) { for (col = 0; col < max_blocks_wide; col += stepc) { av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size); + // Here we use p->src_diff and p->coeff as temporary buffers for + // prediction residue and transform coefficients. The buffers are only + // used in this for loop, therefore we don't need to properly add offset + // to the buffers. 
av1_subtract_block( - xd, txbh, txbw, p->src_diff, block_size_wide[plane_bsize], + bd_info, txbh, txbw, p->src_diff, block_size_wide[plane_bsize], p->src.buf + (((row * p->src.stride) + col) << 2), p->src.stride, pd->dst.buf + (((row * pd->dst.stride) + col) << 2), pd->dst.stride); - switch (tx_size) { - case TX_4X4: - aom_hadamard_4x4(p->src_diff, block_size_wide[plane_bsize], p->coeff); - break; - case TX_8X8: - aom_hadamard_8x8(p->src_diff, block_size_wide[plane_bsize], p->coeff); - break; - case TX_16X16: - aom_hadamard_16x16(p->src_diff, block_size_wide[plane_bsize], - p->coeff); - break; - case TX_32X32: - aom_hadamard_32x32(p->src_diff, block_size_wide[plane_bsize], - p->coeff); - break; - default: assert(0); - } + av1_quick_txfm(use_hadamard, tx_size, bd_info, p->src_diff, + block_size_wide[plane_bsize], p->coeff); satd_cost += aom_satd(p->coeff, tx_size_2d[tx_size]); } } @@ -448,7 +623,9 @@ static AOM_INLINE int model_intra_yrd_and_prune(const AV1_COMP *const cpi, int64_t *best_model_rd) { const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]); const int plane = 0; - const int64_t this_model_rd = intra_model_rd(cpi, x, plane, bsize, tx_size); + const AV1_COMMON *cm = &cpi->common; + const int64_t this_model_rd = + intra_model_rd(cm, x, plane, bsize, tx_size, /*use_hadamard=*/1); if (*best_model_rd != INT64_MAX && this_model_rd > *best_model_rd + (*best_model_rd >> 2)) { return 1; diff --git a/third_party/libaom/source/libaom/av1/encoder/level.c b/third_party/libaom/source/libaom/av1/encoder/level.c index 7a74c460e4..4e1749a1dd 100644 --- a/third_party/libaom/source/libaom/av1/encoder/level.c +++ b/third_party/libaom/source/libaom/av1/encoder/level.c @@ -353,7 +353,7 @@ static double time_to_decode_frame(const AV1_COMMON *const cm, if (spatial_layer_dimensions_present_flag) { assert(0 && "Spatial layer dimensions not supported yet."); } else { - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = 
cm->seq_params; const int max_frame_width = seq_params->max_frame_width; const int max_frame_height = seq_params->max_frame_height; luma_samples = max_frame_width * max_frame_height; @@ -473,7 +473,7 @@ void av1_decoder_model_init(const AV1_COMP *const cpi, AV1_LEVEL level, decoder_model->level = level; const AV1_COMMON *const cm = &cpi->common; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; decoder_model->bit_rate = get_max_bitrate( av1_level_defs + level, seq_params->tier[op_index], seq_params->profile); @@ -690,7 +690,7 @@ void av1_decoder_model_process_frame(const AV1_COMP *const cpi, void av1_init_level_info(AV1_COMP *cpi) { for (int op_index = 0; op_index < MAX_NUM_OPERATING_POINTS; ++op_index) { AV1LevelInfo *const this_level_info = - cpi->level_params.level_info[op_index]; + cpi->ppi->level_params.level_info[op_index]; if (!this_level_info) continue; memset(this_level_info, 0, sizeof(*this_level_info)); AV1LevelSpec *const level_spec = &this_level_info->level_spec; @@ -1048,7 +1048,7 @@ static void scan_past_frames(const FrameWindowBuffer *const buffer, void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start, int64_t ts_end) { AV1_COMMON *const cm = &cpi->common; - const AV1LevelParams *const level_params = &cpi->level_params; + const AV1LevelParams *const level_params = &cpi->ppi->level_params; const int upscaled_width = cm->superres_upscaled_width; const int width = cm->width; @@ -1057,7 +1057,7 @@ void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start, const int tile_rows = cm->tiles.rows; const int tiles = tile_cols * tile_rows; const int luma_pic_size = upscaled_width * height; - const int frame_header_count = level_params->frame_header_count; + const int frame_header_count = cpi->frame_header_count; const int show_frame = cm->show_frame; const int show_existing_frame = cm->show_existing_frame; @@ -1075,7 +1075,7 @@ void 
av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start, const int temporal_layer_id = cm->temporal_layer_id; const int spatial_layer_id = cm->spatial_layer_id; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; const BITSTREAM_PROFILE profile = seq_params->profile; const int is_still_picture = seq_params->still_picture; // update level_stats @@ -1148,7 +1148,7 @@ void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start, if (fail_id != TARGET_LEVEL_OK) { const int target_level_major = 2 + (target_level >> 2); const int target_level_minor = target_level & 3; - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Failed to encode to the target level %d_%d. %s", target_level_major, target_level_minor, level_fail_messages[fail_id]); diff --git a/third_party/libaom/source/libaom/av1/encoder/level.h b/third_party/libaom/source/libaom/av1/encoder/level.h index 5e0cce2007..2800e3d40d 100644 --- a/third_party/libaom/source/libaom/av1/encoder/level.h +++ b/third_party/libaom/source/libaom/av1/encoder/level.h @@ -164,8 +164,6 @@ typedef struct AV1LevelParams { uint32_t keep_level_stats; // Level information for each operating point. AV1LevelInfo *level_info[MAX_NUM_OPERATING_POINTS]; - // Count the number of OBU_FRAME and OBU_FRAME_HEADER for level calculation. 
- int frame_header_count; } AV1LevelParams; static INLINE int is_in_operating_point(int operating_point, diff --git a/third_party/libaom/source/libaom/av1/encoder/mcomp.c b/third_party/libaom/source/libaom/av1/encoder/mcomp.c index 06f9386102..1a53c23c74 100644 --- a/third_party/libaom/source/libaom/av1/encoder/mcomp.c +++ b/third_party/libaom/source/libaom/av1/encoder/mcomp.c @@ -95,7 +95,7 @@ void av1_make_default_fullpel_ms_params( // High level params ms_params->bsize = bsize; - ms_params->vfp = &cpi->fn_ptr[bsize]; + ms_params->vfp = &cpi->ppi->fn_ptr[bsize]; init_ms_buffers(&ms_params->ms_buffers, x); @@ -145,8 +145,8 @@ void av1_set_ms_to_intra_mode(FULLPEL_MOTION_SEARCH_PARAMS *ms_params, MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; mv_cost_params->mvjcost = dv_costs->joint_mv; - mv_cost_params->mvcost[0] = &dv_costs->mv_component[0][MV_MAX]; - mv_cost_params->mvcost[1] = &dv_costs->mv_component[1][MV_MAX]; + mv_cost_params->mvcost[0] = dv_costs->dv_costs[0]; + mv_cost_params->mvcost[1] = dv_costs->dv_costs[1]; } void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params, @@ -167,7 +167,7 @@ void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params, x->errorperbit, x->sadperbit); // Subpel variance params - ms_params->var_params.vfp = &cpi->fn_ptr[bsize]; + ms_params->var_params.vfp = &cpi->ppi->fn_ptr[bsize]; ms_params->var_params.subpel_search_type = cpi->sf.mv_sf.use_accurate_subpel_search; ms_params->var_params.w = block_size_wide[bsize]; @@ -253,7 +253,7 @@ static INLINE int mv_cost(const MV *mv, const int *joint_cost, // nearest 2 ** 7. // This is NOT used during motion compensation. 
int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost, - int *mvcost[2], int weight) { + int *const mvcost[2], int weight) { const MV diff = { mv->row - ref_mv->row, mv->col - ref_mv->col }; return ROUND_POWER_OF_TWO( mv_cost(&diff, mvjcost, CONVERT_TO_CONST_MVCOST(mvcost)) * weight, 7); @@ -290,6 +290,9 @@ static INLINE int mv_err_cost(const MV *mv, const MV *ref_mv, static INLINE int mv_err_cost_(const MV *mv, const MV_COST_PARAMS *mv_cost_params) { + if (mv_cost_params->mv_cost_type == MV_COST_NONE) { + return 0; + } return mv_err_cost(mv, mv_cost_params->ref_mv, mv_cost_params->mvjcost, mv_cost_params->mvcost, mv_cost_params->error_per_bit, mv_cost_params->mv_cost_type); @@ -1830,7 +1833,7 @@ int av1_intrabc_hash_search(const AV1_COMP *cpi, const MACROBLOCKD *xd, const MV dv = { GET_MV_SUBPEL(ref_block_hash.y - y_pos), GET_MV_SUBPEL(ref_block_hash.x - x_pos) }; if (!av1_is_dv_valid(dv, &cpi->common, xd, mi_row, mi_col, bsize, - cpi->common.seq_params.mib_size_log2)) + cpi->common.seq_params->mib_size_log2)) continue; FULLPEL_MV hash_mv; @@ -1957,8 +1960,8 @@ unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x, if (xd->bd != 8) { unsigned int sad; best_int_mv->as_fullmv = kZeroFullMv; - sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, - xd->plane[0].pre[0].buf, ref_stride); + sad = cpi->ppi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, + xd->plane[0].pre[0].buf, ref_stride); if (scaled_ref_frame) { int i; @@ -2001,7 +2004,8 @@ unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x, FULLPEL_MV this_mv = best_int_mv->as_fullmv; src_buf = x->plane[0].src.buf; ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv); - best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); + best_sad = + cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); { const uint8_t *const pos[4] = { @@ -2011,7 +2015,8 @@ unsigned int 
av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x, ref_buf + ref_stride, }; - cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad); + cpi->ppi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, + this_sad); } for (idx = 0; idx < 4; ++idx) { @@ -2034,7 +2039,8 @@ unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x, ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv); - tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); + tmp_sad = + cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); if (best_sad > tmp_sad) { best_int_mv->as_fullmv = this_mv; best_sad = tmp_sad; @@ -2265,7 +2271,6 @@ static INLINE int get_subpel_part(int x) { return x & 7; } // Gets the address of the ref buffer at subpel location (r, c), rounded to the // nearest fullpel precision toward - \infty - static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf, const MV mv) { const int offset = (mv.row >> 3) * buf->stride + (mv.col >> 3); diff --git a/third_party/libaom/source/libaom/av1/encoder/mcomp.h b/third_party/libaom/source/libaom/av1/encoder/mcomp.h index 901671e27f..b2539f5100 100644 --- a/third_party/libaom/source/libaom/av1/encoder/mcomp.h +++ b/third_party/libaom/source/libaom/av1/encoder/mcomp.h @@ -84,7 +84,7 @@ typedef struct { } MV_COST_PARAMS; int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost, - int *mvcost[2], int weight); + int *const mvcost[2], int weight); int av1_get_mvpred_sse(const MV_COST_PARAMS *mv_cost_params, const FULLPEL_MV best_mv, diff --git a/third_party/libaom/source/libaom/av1/encoder/motion_search_facade.c b/third_party/libaom/source/libaom/av1/encoder/motion_search_facade.c index 96b77b754d..07485bd68c 100644 --- a/third_party/libaom/source/libaom/av1/encoder/motion_search_facade.c +++ b/third_party/libaom/source/libaom/av1/encoder/motion_search_facade.c @@ -15,6 +15,7 @@ #include "av1/encoder/encodemv.h" 
#include "av1/encoder/encoder.h" +#include "av1/encoder/interp_search.h" #include "av1/encoder/mcomp.h" #include "av1/encoder/motion_search_facade.h" #include "av1/encoder/partition_strategy.h" @@ -41,7 +42,7 @@ static int compare_weight(const void *a, const void *b) { // Allow more mesh searches for screen content type on the ARF. static int use_fine_search_interval(const AV1_COMP *const cpi) { return cpi->is_screen_content_type && - cpi->gf_group.update_type[cpi->gf_group.index] == ARF_UPDATE && + cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == ARF_UPDATE && cpi->oxcf.speed <= 2; } @@ -62,15 +63,15 @@ static INLINE void get_mv_candidate_from_tpl(const AV1_COMP *const cpi, const int mi_col = xd->mi_col; const BLOCK_SIZE tpl_bsize = - convert_length_to_bsize(cpi->tpl_data.tpl_bsize_1d); + convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d); const int tplw = mi_size_wide[tpl_bsize]; const int tplh = mi_size_high[tpl_bsize]; const int nw = mi_size_wide[bsize] / tplw; const int nh = mi_size_high[bsize] / tplh; if (nw >= 1 && nh >= 1) { - const int of_h = mi_row % mi_size_high[cm->seq_params.sb_size]; - const int of_w = mi_col % mi_size_wide[cm->seq_params.sb_size]; + const int of_h = mi_row % mi_size_high[cm->seq_params->sb_size]; + const int of_w = mi_col % mi_size_wide[cm->seq_params->sb_size]; const int start = of_h / tplh * sb_enc->tpl_stride + of_w / tplw; int valid = 1; @@ -119,7 +120,8 @@ static INLINE void get_mv_candidate_from_tpl(const AV1_COMP *const cpi, void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int ref_idx, int *rate_mv, int search_range, inter_mode_info *mode_info, - int_mv *best_mv) { + int_mv *best_mv, + struct HandleInterModeArgs *const args) { MACROBLOCKD *xd = &x->e_mbd; const AV1_COMMON *cm = &cpi->common; const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params; @@ -243,13 +245,9 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, } } - // Terminate 
search with the current ref_idx if we have already encountered - // another ref_mv in the drl such that: - // 1. The other drl has the same fullpel_mv during the SIMPLE_TRANSLATION - // search process as the current fullpel_mv. - // 2. The rate needed to encode the current fullpel_mv is larger than that - // for the other ref_mv. - if (cpi->sf.inter_sf.skip_repeated_full_newmv && + // Terminate search with the current ref_idx based on fullpel mv, rate cost, + // and other know cost. + if (cpi->sf.inter_sf.skip_newmv_in_drl >= 2 && mbmi->motion_mode == SIMPLE_TRANSLATION && best_mv->as_int != INVALID_MV) { int_mv this_mv; @@ -260,6 +258,7 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, mv_costs->mv_cost_stack, MV_COST_WEIGHT); mode_info[ref_mv_idx].full_search_mv.as_int = this_mv.as_int; mode_info[ref_mv_idx].full_mv_rate = this_mv_rate; + mode_info[ref_mv_idx].full_mv_bestsme = bestsme; for (int prev_ref_idx = 0; prev_ref_idx < ref_mv_idx; ++prev_ref_idx) { // Check if the motion search result same as previous results @@ -280,6 +279,19 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, return; } } + + // Terminate the evaluation of current ref_mv_idx based on bestsme and + // drl_cost. + const int psme = mode_info[prev_ref_idx].full_mv_bestsme; + if (psme == INT_MAX) continue; + const int thr = + cpi->sf.inter_sf.skip_newmv_in_drl == 3 ? 
(psme + (psme >> 2)) : psme; + if (cpi->sf.inter_sf.skip_newmv_in_drl >= 3 && + mode_info[ref_mv_idx].full_mv_bestsme > thr && + mode_info[prev_ref_idx].drl_cost < mode_info[ref_mv_idx].drl_cost) { + best_mv->as_int = INVALID_MV; + return; + } } } @@ -289,6 +301,8 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, const int use_fractional_mv = bestsme < INT_MAX && cpi->common.features.cur_frame_force_integer_mv == 0; + int best_mv_rate = 0; + int mv_rate_calculated = 0; if (use_fractional_mv) { int_mv fractional_ms_list[3]; av1_set_fractional_mv(fractional_ms_list); @@ -337,9 +351,10 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, subpel_start_mv = get_mv_from_fullmv(&second_best_mv.as_fullmv); if (av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)) { + unsigned int sse; const int this_var = mv_search_params->find_fractional_mv_step( xd, cm, &ms_params, subpel_start_mv, &this_best_mv, &dis, - &x->pred_sse[ref], fractional_ms_list); + &sse, fractional_ms_list); if (!cpi->sf.mv_sf.disable_second_mv) { // If cpi->sf.mv_sf.disable_second_mv is 0, use actual rd cost @@ -358,11 +373,17 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, int64_t tmp_rd = RDCOST(x->rdmult, tmp_rd_stats.rate + tmp_mv_rate, tmp_rd_stats.dist); - if (tmp_rd < rd) best_mv->as_mv = this_best_mv; + if (tmp_rd < rd) { + best_mv->as_mv = this_best_mv; + x->pred_sse[ref] = sse; + } } else { // If cpi->sf.mv_sf.disable_second_mv = 1, use var to decide the // best MV. - if (this_var < best_mv_var) best_mv->as_mv = this_best_mv; + if (this_var < best_mv_var) { + best_mv->as_mv = this_best_mv; + x->pred_sse[ref] = sse; + } } } } @@ -379,9 +400,52 @@ void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, break; default: assert(0 && "Invalid motion mode!\n"); } + + // Terminate search with the current ref_idx based on subpel mv and rate + // cost. 
+ if (cpi->sf.inter_sf.skip_newmv_in_drl >= 1 && args != NULL && + mbmi->motion_mode == SIMPLE_TRANSLATION && + best_mv->as_int != INVALID_MV) { + const int ref_mv_idx = mbmi->ref_mv_idx; + best_mv_rate = + av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost, + mv_costs->mv_cost_stack, MV_COST_WEIGHT); + mv_rate_calculated = 1; + + for (int prev_ref_idx = 0; prev_ref_idx < ref_mv_idx; ++prev_ref_idx) { + if (!args->single_newmv_valid[prev_ref_idx][ref]) continue; + // Check if the motion vectors are the same. + if (best_mv->as_int == args->single_newmv[prev_ref_idx][ref].as_int) { + // Skip this evaluation if the previous one is skipped. + if (mode_info[prev_ref_idx].skip) { + mode_info[ref_mv_idx].skip = 1; + break; + } + // Compare the rate cost that we current know. + const int prev_rate_cost = + args->single_newmv_rate[prev_ref_idx][ref] + + mode_info[prev_ref_idx].drl_cost; + const int this_rate_cost = + best_mv_rate + mode_info[ref_mv_idx].drl_cost; + + if (prev_rate_cost <= this_rate_cost) { + // If the current rate_cost is worse than the previous rate_cost, + // then we terminate the search for this ref_mv_idx. 
+ mode_info[ref_mv_idx].skip = 1; + break; + } + } + } + } + } + + if (mv_rate_calculated) { + *rate_mv = best_mv_rate; + } else { + *rate_mv = + av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost, + mv_costs->mv_cost_stack, MV_COST_WEIGHT); } - *rate_mv = av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost, - mv_costs->mv_cost_stack, MV_COST_WEIGHT); } int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, @@ -920,7 +984,7 @@ int_mv av1_simple_motion_sse_var(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, const uint8_t *dst = xd->plane[0].dst.buf; const int dst_stride = xd->plane[0].dst.stride; - *var = cpi->fn_ptr[bsize].vf(src, src_stride, dst, dst_stride, sse); + *var = cpi->ppi->fn_ptr[bsize].vf(src, src_stride, dst, dst_stride, sse); return best_mv; } diff --git a/third_party/libaom/source/libaom/av1/encoder/motion_search_facade.h b/third_party/libaom/source/libaom/av1/encoder/motion_search_facade.h index 5736f2b756..bf81fe243a 100644 --- a/third_party/libaom/source/libaom/av1/encoder/motion_search_facade.h +++ b/third_party/libaom/source/libaom/av1/encoder/motion_search_facade.h @@ -21,20 +21,19 @@ extern "C" { // TODO(any): rename this struct to something else. There is already another // struct called inter_modes_info, which makes this terribly confusing. 
typedef struct { - int64_t rd; int drl_cost; - - int rate_mv; - int_mv mv; - int_mv full_search_mv; int full_mv_rate; + int full_mv_bestsme; + int skip; } inter_mode_info; +struct HandleInterModeArgs; void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int ref_idx, int *rate_mv, int search_range, inter_mode_info *mode_info, - int_mv *best_mv); + int_mv *best_mv, + struct HandleInterModeArgs *const args); int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv, diff --git a/third_party/libaom/source/libaom/av1/encoder/mv_prec.c b/third_party/libaom/source/libaom/av1/encoder/mv_prec.c index cc81d72170..ae9dc35af4 100644 --- a/third_party/libaom/source/libaom/av1/encoder/mv_prec.c +++ b/third_party/libaom/source/libaom/av1/encoder/mv_prec.c @@ -230,7 +230,7 @@ static AOM_INLINE void collect_mv_stats_b(MV_STATS *mv_stats, const int y_stride = cpi->source->y_stride; const int px_row = 4 * mi_row, px_col = 4 * mi_col; const int buf_is_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH; - const int bd = cm->seq_params.bit_depth; + const int bd = cm->seq_params->bit_depth; if (buf_is_hbd) { uint16_t *source_buf = CONVERT_TO_SHORTPTR(cpi->source->y_buffer) + px_row * y_stride + px_col; @@ -339,8 +339,8 @@ static AOM_INLINE void collect_mv_stats_tile(MV_STATS *mv_stats, const int mi_row_end = tile_info->mi_row_end; const int mi_col_start = tile_info->mi_col_start; const int mi_col_end = tile_info->mi_col_end; - const int sb_size_mi = cm->seq_params.mib_size; - BLOCK_SIZE sb_size = cm->seq_params.sb_size; + const int sb_size_mi = cm->seq_params->mib_size; + BLOCK_SIZE sb_size = cm->seq_params->sb_size; for (int mi_row = mi_row_start; mi_row < mi_row_end; mi_row += sb_size_mi) { for (int mi_col = mi_col_start; mi_col < mi_col_end; mi_col += sb_size_mi) { collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, sb_size); @@ -349,7 +349,7 @@ static AOM_INLINE void collect_mv_stats_tile(MV_STATS *mv_stats, } 
void av1_collect_mv_stats(AV1_COMP *cpi, int current_q) { - MV_STATS *mv_stats = &cpi->mv_stats; + MV_STATS *mv_stats = &cpi->ppi->mv_stats; const AV1_COMMON *cm = &cpi->common; const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; @@ -420,8 +420,8 @@ void av1_pick_and_set_high_precision_mv(AV1_COMP *cpi, int qindex) { } #if !CONFIG_REALTIME_ONLY else if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA && - av1_frame_allows_smart_mv(cpi) && cpi->mv_stats.valid) { - use_hp = get_smart_mv_prec(cpi, &cpi->mv_stats, qindex); + av1_frame_allows_smart_mv(cpi) && cpi->ppi->mv_stats.valid) { + use_hp = get_smart_mv_prec(cpi, &cpi->ppi->mv_stats, qindex); } #endif // !CONFIG_REALTIME_ONLY diff --git a/third_party/libaom/source/libaom/av1/encoder/mv_prec.h b/third_party/libaom/source/libaom/av1/encoder/mv_prec.h index 89f95f553e..11dcdd8806 100644 --- a/third_party/libaom/source/libaom/av1/encoder/mv_prec.h +++ b/third_party/libaom/source/libaom/av1/encoder/mv_prec.h @@ -21,8 +21,8 @@ void av1_collect_mv_stats(AV1_COMP *cpi, int current_q); static AOM_INLINE int av1_frame_allows_smart_mv(const AV1_COMP *cpi) { - const int gf_group_index = cpi->gf_group.index; - const int gf_update_type = cpi->gf_group.update_type[gf_group_index]; + const int gf_group_index = cpi->gf_frame_index; + const int gf_update_type = cpi->ppi->gf_group.update_type[gf_group_index]; return !frame_is_intra_only(&cpi->common) && !(gf_update_type == INTNL_OVERLAY_UPDATE || gf_update_type == OVERLAY_UPDATE); diff --git a/third_party/libaom/source/libaom/av1/encoder/nonrd_pickmode.c b/third_party/libaom/source/libaom/av1/encoder/nonrd_pickmode.c index 279fd922dd..088135a2dd 100644 --- a/third_party/libaom/source/libaom/av1/encoder/nonrd_pickmode.c +++ b/third_party/libaom/source/libaom/av1/encoder/nonrd_pickmode.c @@ -353,6 +353,8 @@ static INLINE void find_predictors(AV1_COMP *cpi, MACROBLOCK *x, (void)tile_data; x->pred_mv_sad[ref_frame] = INT_MAX; + x->pred_mv0_sad[ref_frame] = 
INT_MAX; + x->pred_mv1_sad[ref_frame] = INT_MAX; frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; // TODO(kyslov) this needs various further optimizations. to be continued.. assert(yv12 != NULL); @@ -518,7 +520,7 @@ static TX_SIZE calculate_tx_size(const AV1_COMP *const cpi, BLOCK_SIZE bsize, TX_SIZE tx_size; const TxfmSearchParams *txfm_params = &x->txfm_search_params; if (txfm_params->tx_mode_search_type == TX_MODE_SELECT) { - if (sse > (var << 2)) + if (sse > (var << 1)) tx_size = AOMMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]); @@ -729,9 +731,9 @@ static void model_skip_for_sb_y_large(AV1_COMP *cpi, BLOCK_SIZE bsize, (puv->dequant_QTX[1] * puv->dequant_QTX[1]) >> 3; av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, i, i); - var_uv[j] = cpi->fn_ptr[uv_bsize].vf(puv->src.buf, puv->src.stride, - puvd->dst.buf, puvd->dst.stride, - &sse_uv[j]); + var_uv[j] = cpi->ppi->fn_ptr[uv_bsize].vf( + puv->src.buf, puv->src.stride, puvd->dst.buf, puvd->dst.stride, + &sse_uv[j]); if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) && (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j])) skip_uv[j] = 1; @@ -776,8 +778,8 @@ static void model_rd_for_sb_y(const AV1_COMP *const cpi, BLOCK_SIZE bsize, int rate; int64_t dist; - unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride, - pd->dst.buf, pd->dst.stride, &sse); + unsigned int var = cpi->ppi->fn_ptr[bsize].vf( + p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse); xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, x, var, sse); if (calculate_rd) { @@ -1171,8 +1173,8 @@ static void model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize, unsigned int var; if (!x->color_sensitivity[i - 1]) continue; - var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, - pd->dst.stride, &sse); + var = cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, &sse); assert(sse >= var); tot_sse += sse; @@ -1251,12 +1253,12 
@@ static void estimate_block_intra(int plane, int block, int row, int col, (void)block; - p->src.buf = &src_buf_base[4 * (row * src_stride + col)]; - pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)]; - av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size); av1_invalid_rd_stats(&this_rdc); + p->src.buf = &src_buf_base[4 * (row * src_stride + col)]; + pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)]; + if (plane == 0) { block_yrd(cpi, x, 0, 0, &this_rdc, &args->skippable, bsize_tx, AOMMIN(tx_size, TX_16X16)); @@ -1562,7 +1564,7 @@ static void search_filter_ref(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc, else model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], 1); pf_rd_stats[i].rate += av1_get_switchable_rate( - x, xd, cm->features.interp_filter, cm->seq_params.enable_dual_filter); + x, xd, cm->features.interp_filter, cm->seq_params->enable_dual_filter); cost = RDCOST(x->rdmult, pf_rd_stats[i].rate, pf_rd_stats[i].dist); pf_tx_size[i] = mi->tx_size; if (cost < best_cost) { @@ -1618,6 +1620,7 @@ typedef struct _mode_search_stat { static void compute_intra_yprediction(const AV1_COMMON *cm, PREDICTION_MODE mode, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd) { + const SequenceHeader *seq_params = cm->seq_params; struct macroblockd_plane *const pd = &xd->plane[0]; struct macroblock_plane *const p = &x->plane[0]; uint8_t *const src_buf_base = p->src.buf; @@ -1644,10 +1647,11 @@ static void compute_intra_yprediction(const AV1_COMMON *cm, for (col = 0; col < max_blocks_wide; col += (1 << tx_size)) { p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)]; pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)]; - av1_predict_intra_block(cm, xd, block_size_wide[bsize], - block_size_high[bsize], tx_size, mode, 0, 0, - FILTER_INTRA_MODES, pd->dst.buf, dst_stride, - pd->dst.buf, dst_stride, 0, 0, plane); + av1_predict_intra_block( + xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, + 
block_size_wide[bsize], block_size_high[bsize], tx_size, mode, 0, 0, + FILTER_INTRA_MODES, pd->dst.buf, dst_stride, pd->dst.buf, dst_stride, + 0, 0, plane); } } p->src.buf = src_buf_base; @@ -1671,7 +1675,9 @@ void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, const MB_MODE_INFO *left_mi = xd->left_mbmi; const PREDICTION_MODE A = av1_above_block_mode(above_mi); const PREDICTION_MODE L = av1_left_block_mode(left_mi); - bmode_costs = x->mode_costs.y_mode_costs[A][L]; + const int above_ctx = intra_mode_context[A]; + const int left_ctx = intra_mode_context[L]; + bmode_costs = x->mode_costs.y_mode_costs[above_ctx][left_ctx]; av1_invalid_rd_stats(&best_rdc); av1_invalid_rd_stats(&this_rdc); @@ -1734,10 +1740,11 @@ static AOM_INLINE void get_ref_frame_use_mask(AV1_COMP *cpi, MACROBLOCK *x, int *force_skip_low_temp_var) { AV1_COMMON *const cm = &cpi->common; const struct segmentation *const seg = &cm->seg; - const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64); + const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64); // For SVC the usage of alt_ref is determined by the ref_frame_flags. - int use_alt_ref_frame = cpi->use_svc || cpi->sf.rt_sf.use_nonrd_altref_frame; + int use_alt_ref_frame = + cpi->ppi->use_svc || cpi->sf.rt_sf.use_nonrd_altref_frame; int use_golden_ref_frame = 1; use_ref_frame[LAST_FRAME] = 1; // we never skip LAST @@ -1832,7 +1839,7 @@ static void estimate_intra_mode( int intra_cost_penalty = av1_get_intra_cost_penalty( quant_params->base_qindex, quant_params->y_dc_delta_q, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); int64_t inter_mode_thresh = RDCOST(x->rdmult, intra_cost_penalty, 0); int perform_intra_pred = cpi->sf.rt_sf.check_intra_pred_nonrd; // For spatial enhancemanent layer: turn off intra prediction if the @@ -1851,8 +1858,8 @@ static void estimate_intra_mode( // Adjust thresholds to make intra mode likely tested if the other // references (golden, alt) are skipped/not checked. 
For now always // adjust for svc mode. - if (cpi->use_svc || (cpi->sf.rt_sf.use_nonrd_altref_frame == 0 && - cpi->sf.rt_sf.nonrd_prune_ref_frame_search > 0)) { + if (cpi->ppi->use_svc || (cpi->sf.rt_sf.use_nonrd_altref_frame == 0 && + cpi->sf.rt_sf.nonrd_prune_ref_frame_search > 0)) { spatial_var_thresh = 150; motion_thresh = 0; } @@ -2063,6 +2070,40 @@ static AOM_INLINE int skip_mode_by_bsize_and_ref_frame( return 0; } +void set_color_sensitivity(AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + BLOCK_SIZE bsize, int y_sad, + unsigned int source_variance) { + const int factor = (bsize >= BLOCK_32X32) ? 2 : 3; + NOISE_LEVEL noise_level = kLow; + int norm_sad = + y_sad >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + // If the spatial source variance is high and the normalized y_sad + // is low, then y-channel is likely good for mode estimation, so keep + // color_sensitivity off. For low noise content for now, since there is + // some bdrate regression for noisy color clip. 
+ if (cpi->noise_estimate.enabled) + noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate); + if (noise_level == kLow && source_variance > 1000 && norm_sad < 50) { + x->color_sensitivity[0] = 0; + x->color_sensitivity[1] = 0; + return; + } + for (int i = 1; i <= 2; ++i) { + if (x->color_sensitivity[i - 1] == 2) { + struct macroblock_plane *const p = &x->plane[i]; + struct macroblockd_plane *const pd = &xd->plane[i]; + const BLOCK_SIZE bs = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int uv_sad = cpi->ppi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride); + const int norm_uv_sad = + uv_sad >> (b_width_log2_lookup[bs] + b_height_log2_lookup[bs]); + x->color_sensitivity[i - 1] = + uv_sad > (factor * (y_sad >> 3)) && norm_uv_sad > 40; + } + } +} + void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *x, RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { @@ -2104,7 +2145,7 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, DECLARE_ALIGNED(16, uint8_t, pred_buf[3 * 128 * 128]); PRED_BUFFER *this_mode_pred = NULL; const int reuse_inter_pred = cpi->sf.rt_sf.reuse_inter_pred_nonrd && - cm->seq_params.bit_depth == AOM_BITS_8; + cm->seq_params->bit_depth == AOM_BITS_8; const int bh = block_size_high[bsize]; const int bw = block_size_wide[bsize]; @@ -2135,7 +2176,8 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, cpi->common.height != cpi->resize_pending_params.height)); #endif - + x->color_sensitivity[0] = x->color_sensitivity_sb[0]; + x->color_sensitivity[1] = x->color_sensitivity_sb[1]; init_best_pickmode(&best_pickmode); const ModeCosts *mode_costs = &x->mode_costs; @@ -2170,7 +2212,8 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, #if CONFIG_AV1_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0) { - // if (cpi->use_svc) denoise_svc_pickmode = av1_denoise_svc_non_key(cpi); + 
// if (cpi->ppi->use_svc) denoise_svc_pickmode = + // av1_denoise_svc_non_key(cpi); if (cpi->denoiser.denoising_level > kDenLowLow && denoise_svc_pickmode) av1_denoiser_reset_frame_stats(ctx); } @@ -2183,7 +2226,7 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, // to source, so use subpel motion vector to compensate. The nonzero motion // is half pixel shifted to left and top, so (-4, -4). This has more effect // on higher resolutins, so condition it on that for now. - if (cpi->use_svc && svc->spatial_layer_id > 0 && + if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 && svc->downsample_filter_phase[svc->spatial_layer_id - 1] == 8 && cm->width * cm->height > 640 * 480) { svc_mv_col = -4; @@ -2210,7 +2253,7 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, const int use_model_yrd_large = cpi->oxcf.rc_cfg.mode == AOM_CBR && large_block && !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) && - quant_params->base_qindex && cm->seq_params.bit_depth == 8; + quant_params->base_qindex && cm->seq_params->bit_depth == 8; const int enable_filter_search = is_filter_search_enabled(cpi, mi_row, mi_col, bsize); @@ -2264,7 +2307,7 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, if (!use_ref_frame_mask[ref_frame]) continue; force_mv_inter_layer = 0; - if (cpi->use_svc && svc->spatial_layer_id > 0 && + if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 && ((ref_frame == LAST_FRAME && svc->skip_mvsearch_last) || (ref_frame == GOLDEN_FRAME && svc->skip_mvsearch_gf))) { // Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row), @@ -2306,6 +2349,10 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, if ((int64_t)(x->pred_mv_sad[ref_frame]) > thresh_sad_pred) continue; } } + // Check for skipping NEARMV based on pred_mv_sad. 
+ if (this_mode == NEARMV && x->pred_mv1_sad[ref_frame] != INT_MAX && + x->pred_mv1_sad[ref_frame] > (x->pred_mv0_sad[ref_frame] << 1)) + continue; if (skip_mode_by_threshold( this_mode, ref_frame, frame_mv[this_mode][ref_frame], @@ -2357,6 +2404,22 @@ void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, #if COLLECT_PICK_MODE_STAT ms_stat.num_nonskipped_searches[bsize][this_mode]++; #endif + + if (idx == 0) { + // Set color sensitivity on first tested mode only. + // Use y-sad already computed in find_predictors: take the sad with motion + // vector closest to 0; the uv-sad computed below in set_color_sensitivity + // is for zeromv. + int y_sad = x->pred_mv0_sad[LAST_FRAME]; + if (x->pred_mv1_sad[LAST_FRAME] != INT_MAX && + (abs(frame_mv[NEARMV][LAST_FRAME].as_mv.col) + + abs(frame_mv[NEARMV][LAST_FRAME].as_mv.row)) < + (abs(frame_mv[NEARESTMV][LAST_FRAME].as_mv.col) + + abs(frame_mv[NEARESTMV][LAST_FRAME].as_mv.row))) + y_sad = x->pred_mv1_sad[LAST_FRAME]; + set_color_sensitivity(cpi, x, xd, bsize, y_sad, x->source_variance); + } + if (enable_filter_search && !force_mv_inter_layer && ((mi->mv[0].as_mv.row & 0x07) || (mi->mv[0].as_mv.col & 0x07)) && (ref_frame == LAST_FRAME || !x->nonrd_prune_ref_frame_search)) { diff --git a/third_party/libaom/source/libaom/av1/encoder/optical_flow.c b/third_party/libaom/source/libaom/av1/encoder/optical_flow.c index 82ae9c5774..d2f03ed641 100644 --- a/third_party/libaom/source/libaom/av1/encoder/optical_flow.c +++ b/third_party/libaom/source/libaom/av1/encoder/optical_flow.c @@ -819,7 +819,7 @@ static void solve_horn_schunck(const double *ix, const double *iy, } av1_init_sparse_mtx(row_pos, col_pos, values, c, 2 * width * height, 2 * width * height, &A); - // substract init mv part from b + // subtract init mv part from b av1_mtx_vect_multi_left(&A, mv_init_vec, temp_b, 2 * width * height); for (int i = 0; i < 2 * width * height; i++) { b[i] = -temp_b[i]; @@ -882,10 +882,11 @@ static void 
solve_horn_schunck(const double *ix, const double *iy, } // Calculate optical flow from from_frame to to_frame using the H-S method. -void horn_schunck(const YV12_BUFFER_CONFIG *from_frame, - const YV12_BUFFER_CONFIG *to_frame, const int level, - const int mv_stride, const int mv_height, const int mv_width, - const OPFL_PARAMS *opfl_params, LOCALMV *mvs) { +static void horn_schunck(const YV12_BUFFER_CONFIG *from_frame, + const YV12_BUFFER_CONFIG *to_frame, const int level, + const int mv_stride, const int mv_height, + const int mv_width, const OPFL_PARAMS *opfl_params, + LOCALMV *mvs) { // mvs are always on level 0, here we define two new mv arrays that is of size // of this level. const int fw = from_frame->y_crop_width; diff --git a/third_party/libaom/source/libaom/av1/encoder/palette.c b/third_party/libaom/source/libaom/av1/encoder/palette.c index fd579b7f7f..fbc16ca742 100644 --- a/third_party/libaom/source/libaom/av1/encoder/palette.c +++ b/third_party/libaom/source/libaom/av1/encoder/palette.c @@ -218,12 +218,12 @@ static AOM_INLINE void palette_rd_y( const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *centroids, int n, uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi, - uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd, - int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, - int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip, - uint8_t *tx_type_map, int *beat_best_palette_rd) { + uint8_t *best_palette_color_map, int64_t *best_rd, int *rate, + int *rate_tokenonly, int64_t *distortion, int *skippable, int *beat_best_rd, + PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip, uint8_t *tx_type_map, + int *beat_best_palette_rd) { optimize_palette_colors(color_cache, n_cache, n, 1, centroids, - cpi->common.seq_params.bit_depth); + cpi->common.seq_params->bit_depth); const int num_unique_colors = av1_remove_duplicates(centroids, n); if 
(num_unique_colors < PALETTE_MIN_SIZE) { // Too few unique colors to create a palette. And DC_PRED will work @@ -231,10 +231,10 @@ static AOM_INLINE void palette_rd_y( return; } PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; - if (cpi->common.seq_params.use_highbitdepth) { + if (cpi->common.seq_params->use_highbitdepth) { for (int i = 0; i < num_unique_colors; ++i) { pmi->palette_colors[i] = clip_pixel_highbd( - (int)centroids[i], cpi->common.seq_params.bit_depth); + (int)centroids[i], cpi->common.seq_params->bit_depth); } } else { for (int i = 0; i < num_unique_colors; ++i) { @@ -251,10 +251,6 @@ static AOM_INLINE void palette_rd_y( 1); extend_palette_color_map(color_map, cols, rows, block_width, block_height); - if (model_intra_yrd_and_prune(cpi, x, bsize, best_model_rd)) { - return; - } - RD_STATS tokenonly_rd_stats; av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd); @@ -304,10 +300,9 @@ static AOM_INLINE int perform_top_color_palette_search( BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *top_colors, int start_n, int end_n, int step_size, int *last_n_searched, uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi, - uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd, - int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, - int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip, - uint8_t *tx_type_map) { + uint8_t *best_palette_color_map, int64_t *best_rd, int *rate, + int *rate_tokenonly, int64_t *distortion, int *skippable, int *beat_best_rd, + PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip, uint8_t *tx_type_map) { int centroids[PALETTE_MAX_SIZE]; int n = start_n; int top_color_winner = end_n; @@ -320,8 +315,8 @@ static AOM_INLINE int perform_top_color_palette_search( memcpy(centroids, top_colors, n * sizeof(top_colors[0])); palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n, color_cache, n_cache, best_mbmi, best_palette_color_map, - 
best_rd, best_model_rd, rate, rate_tokenonly, distortion, - skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, + best_rd, rate, rate_tokenonly, distortion, skippable, + beat_best_rd, ctx, best_blk_skip, tx_type_map, &beat_best_palette_rd); *last_n_searched = n; if (beat_best_palette_rd) { @@ -345,10 +340,9 @@ static AOM_INLINE int perform_k_means_palette_search( int upper_bound, int start_n, int end_n, int step_size, int *last_n_searched, uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd, - int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion, - int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx, - uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map, - int data_points) { + int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, + int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip, + uint8_t *tx_type_map, uint8_t *color_map, int data_points) { int centroids[PALETTE_MAX_SIZE]; const int max_itr = 50; int n = start_n; @@ -366,8 +360,8 @@ static AOM_INLINE int perform_k_means_palette_search( av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr); palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n, color_cache, n_cache, best_mbmi, best_palette_color_map, - best_rd, best_model_rd, rate, rate_tokenonly, distortion, - skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, + best_rd, rate, rate_tokenonly, distortion, skippable, + beat_best_rd, ctx, best_blk_skip, tx_type_map, &beat_best_palette_rd); *last_n_searched = n; if (beat_best_palette_rd) { @@ -434,9 +428,9 @@ static AOM_INLINE void fill_data_and_get_bounds( void av1_rd_pick_palette_intra_sby( const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd, - int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion, - int *skippable, int 
*beat_best_rd, PICK_MODE_CONTEXT *ctx, - uint8_t *best_blk_skip, uint8_t *tx_type_map) { + int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, + int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip, + uint8_t *tx_type_map) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; assert(!is_inter_block(mbmi)); @@ -450,7 +444,7 @@ void av1_rd_pick_palette_intra_sby( int block_width, block_height, rows, cols; av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows, &cols); - const SequenceHeader *const seq_params = &cpi->common.seq_params; + const SequenceHeader *const seq_params = cpi->common.seq_params; const int is_hbd = seq_params->use_highbitdepth; const int bit_depth = seq_params->bit_depth; int unused; @@ -532,8 +526,8 @@ void av1_rd_pick_palette_intra_sby( const int top_color_winner = perform_top_color_palette_search( cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, max_n + 1, step_size, &unused, color_cache, n_cache, best_mbmi, - best_palette_color_map, best_rd, best_model_rd, rate, rate_tokenonly, - distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map); + best_palette_color_map, best_rd, rate, rate_tokenonly, distortion, + skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map); // Evaluate neighbors for the winner color (if winner is found) in the // above coarse search for dominant colors if (top_color_winner <= max_n) { @@ -544,18 +538,18 @@ void av1_rd_pick_palette_intra_sby( perform_top_color_palette_search( cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, stage2_min_n, stage2_max_n + 1, stage2_step_size, &unused, color_cache, n_cache, - best_mbmi, best_palette_color_map, best_rd, best_model_rd, rate, - rate_tokenonly, distortion, skippable, beat_best_rd, ctx, - best_blk_skip, tx_type_map); + best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, + distortion, skippable, beat_best_rd, ctx, best_blk_skip, + tx_type_map); } // K-means clustering. 
// Perform k-means coarse palette search to find the winner candidate const int k_means_winner = perform_k_means_palette_search( cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound, min_n, max_n + 1, step_size, &unused, color_cache, n_cache, best_mbmi, - best_palette_color_map, best_rd, best_model_rd, rate, rate_tokenonly, - distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, - color_map, rows * cols); + best_palette_color_map, best_rd, rate, rate_tokenonly, distortion, + skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map, + rows * cols); // Evaluate neighbors for the winner color (if winner is found) in the // above coarse search for k-means if (k_means_winner <= max_n) { @@ -567,9 +561,8 @@ void av1_rd_pick_palette_intra_sby( cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound, start_n_stage2, end_n_stage2 + 1, step_size_stage2, &unused, color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd, - best_model_rd, rate, rate_tokenonly, distortion, skippable, - beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map, - rows * cols); + rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx, + best_blk_skip, tx_type_map, color_map, rows * cols); } } else { const int max_n = AOMMIN(colors, PALETTE_MAX_SIZE), @@ -579,17 +572,16 @@ void av1_rd_pick_palette_intra_sby( perform_top_color_palette_search( cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, max_n, min_n - 1, -1, &last_n_searched, color_cache, n_cache, best_mbmi, - best_palette_color_map, best_rd, best_model_rd, rate, rate_tokenonly, - distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map); + best_palette_color_map, best_rd, rate, rate_tokenonly, distortion, + skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map); if (last_n_searched > min_n) { // Search in ascending order until we get to the previous best perform_top_color_palette_search( cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, 
last_n_searched, 1, &unused, color_cache, n_cache, best_mbmi, - best_palette_color_map, best_rd, best_model_rd, rate, - rate_tokenonly, distortion, skippable, beat_best_rd, ctx, - best_blk_skip, tx_type_map); + best_palette_color_map, best_rd, rate, rate_tokenonly, distortion, + skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map); } // K-means clustering. if (colors == PALETTE_MIN_SIZE) { @@ -599,26 +591,25 @@ void av1_rd_pick_palette_intra_sby( centroids[1] = upper_bound; palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, colors, color_cache, n_cache, best_mbmi, best_palette_color_map, - best_rd, best_model_rd, rate, rate_tokenonly, distortion, - skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, - NULL); + best_rd, rate, rate_tokenonly, distortion, skippable, + beat_best_rd, ctx, best_blk_skip, tx_type_map, NULL); } else { // Perform k-means palette search in descending order last_n_searched = max_n; perform_k_means_palette_search( cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound, max_n, min_n - 1, -1, &last_n_searched, color_cache, n_cache, - best_mbmi, best_palette_color_map, best_rd, best_model_rd, rate, - rate_tokenonly, distortion, skippable, beat_best_rd, ctx, - best_blk_skip, tx_type_map, color_map, rows * cols); + best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, + distortion, skippable, beat_best_rd, ctx, best_blk_skip, + tx_type_map, color_map, rows * cols); if (last_n_searched > min_n) { // Search in ascending order until we get to the previous best perform_k_means_palette_search( cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound, min_n, last_n_searched, 1, &unused, color_cache, n_cache, - best_mbmi, best_palette_color_map, best_rd, best_model_rd, rate, - rate_tokenonly, distortion, skippable, beat_best_rd, ctx, - best_blk_skip, tx_type_map, color_map, rows * cols); + best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly, + distortion, skippable, beat_best_rd, ctx, 
best_blk_skip, + tx_type_map, color_map, rows * cols); } } } @@ -645,7 +636,7 @@ void av1_rd_pick_palette_intra_sbuv(const AV1_COMP *cpi, MACROBLOCK *x, mbmi->bsize)); PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; const BLOCK_SIZE bsize = mbmi->bsize; - const SequenceHeader *const seq_params = &cpi->common.seq_params; + const SequenceHeader *const seq_params = cpi->common.seq_params; int this_rate; int64_t this_rd; int colors_u, colors_v, colors; @@ -737,7 +728,7 @@ void av1_rd_pick_palette_intra_sbuv(const AV1_COMP *cpi, MACROBLOCK *x, } av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr); optimize_palette_colors(color_cache, n_cache, n, 2, centroids, - cpi->common.seq_params.bit_depth); + cpi->common.seq_params->bit_depth); // Sort the U channel colors in ascending order. for (i = 0; i < 2 * (n - 1); i += 2) { int min_idx = i; @@ -811,7 +802,7 @@ void av1_restore_uv_color_map(const AV1_COMP *cpi, MACROBLOCK *x) { for (r = 0; r < rows; ++r) { for (c = 0; c < cols; ++c) { - if (cpi->common.seq_params.use_highbitdepth) { + if (cpi->common.seq_params->use_highbitdepth) { data[(r * cols + c) * 2] = src_u16[r * src_stride + c]; data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c]; } else { diff --git a/third_party/libaom/source/libaom/av1/encoder/palette.h b/third_party/libaom/source/libaom/av1/encoder/palette.h index 85af473892..7d9a72f61d 100644 --- a/third_party/libaom/source/libaom/av1/encoder/palette.h +++ b/third_party/libaom/source/libaom/av1/encoder/palette.h @@ -185,10 +185,9 @@ int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi, void av1_rd_pick_palette_intra_sby( const struct AV1_COMP *cpi, struct macroblock *x, BLOCK_SIZE bsize, int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, - int64_t *best_rd, int64_t *best_model_rd, int *rate, int *rate_tokenonly, - int64_t *distortion, int *skippable, int *beat_best_rd, - struct PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip, - uint8_t 
*tx_type_map); + int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion, + int *skippable, int *beat_best_rd, struct PICK_MODE_CONTEXT *ctx, + uint8_t *best_blk_skip, uint8_t *tx_type_map); /*!\brief Search for the best palette in the chroma plane. * diff --git a/third_party/libaom/source/libaom/av1/encoder/partition_search.c b/third_party/libaom/source/libaom/av1/encoder/partition_search.c index 5d54a80b36..c5bfaf684f 100644 --- a/third_party/libaom/source/libaom/av1/encoder/partition_search.c +++ b/third_party/libaom/source/libaom/av1/encoder/partition_search.c @@ -25,6 +25,7 @@ #include "av1/encoder/encodemv.h" #include "av1/encoder/motion_search_facade.h" #include "av1/encoder/partition_search.h" +#include "av1/encoder/partition_strategy.h" #include "av1/encoder/reconinter_enc.h" #include "av1/encoder/tokenize.h" #include "av1/encoder/var_based_part.h" @@ -34,6 +35,48 @@ #include "av1/encoder/tune_vmaf.h" #endif +void av1_reset_part_sf(PARTITION_SPEED_FEATURES *part_sf) { + part_sf->partition_search_type = SEARCH_PARTITION; + part_sf->less_rectangular_check_level = 0; + part_sf->use_square_partition_only_threshold = BLOCK_128X128; + part_sf->auto_max_partition_based_on_simple_motion = NOT_IN_USE; + part_sf->default_max_partition_size = BLOCK_LARGEST; + part_sf->default_min_partition_size = BLOCK_4X4; + part_sf->adjust_var_based_rd_partitioning = 0; + part_sf->allow_partition_search_skip = 0; + part_sf->max_intra_bsize = BLOCK_LARGEST; + // This setting only takes effect when partition_search_type is set + // to FIXED_PARTITION. + part_sf->fixed_partition_size = BLOCK_16X16; + // Recode loop tolerance %. 
+ part_sf->partition_search_breakout_dist_thr = 0; + part_sf->partition_search_breakout_rate_thr = 0; + part_sf->prune_ext_partition_types_search_level = 0; + part_sf->prune_part4_search = 0; + part_sf->ml_prune_partition = 0; + part_sf->ml_early_term_after_part_split_level = 0; + for (int i = 0; i < PARTITION_BLOCK_SIZES; ++i) { + part_sf->ml_partition_search_breakout_thresh[i] = + -1; // -1 means not enabled. + } + part_sf->simple_motion_search_prune_agg = 0; + part_sf->simple_motion_search_split = 0; + part_sf->simple_motion_search_prune_rect = 0; + part_sf->simple_motion_search_early_term_none = 0; + part_sf->simple_motion_search_reduce_search_steps = 0; + part_sf->intra_cnn_split = 0; + part_sf->ext_partition_eval_thresh = BLOCK_8X8; + part_sf->prune_ext_part_using_split_info = 0; + part_sf->prune_rectangular_split_based_on_qidx = 0; + part_sf->early_term_after_none_split = 0; + part_sf->ml_predict_breakout_level = 0; + part_sf->prune_sub_8x8_partition_level = 0; + part_sf->simple_motion_search_rect_split = 0; + part_sf->reuse_prev_rd_results_for_part_ab = 0; + part_sf->reuse_best_prediction_for_part_ab = 0; + part_sf->use_best_rd_for_pruning = 0; +} + static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd, FRAME_COUNTS *counts, TX_SIZE tx_size, int depth, int blk_row, int blk_col, @@ -151,11 +194,14 @@ static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row, const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; const int bsw = tx_size_wide_unit[sub_txs]; const int bsh = tx_size_high_unit[sub_txs]; - for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { - for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { - const int offsetr = blk_row + row; + const int row_end = + AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); + const int col_end = + AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); + for (int row = 0; row < row_end; row += bsh) { + const int offsetr = blk_row + row; + for (int col = 
0; col < col_end; col += bsw) { const int offsetc = blk_col + col; - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; set_txfm_context(xd, sub_txs, offsetr, offsetc); } } @@ -281,7 +327,7 @@ static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data, xd->block_ref_scale_factors[ref], num_planes); } const int start_plane = (cpi->sf.rt_sf.reuse_inter_pred_nonrd && - cm->seq_params.bit_depth == AOM_BITS_8) + cm->seq_params->bit_depth == AOM_BITS_8) ? 1 : 0; av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, @@ -395,8 +441,8 @@ static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data, if (!dry_run) { if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0 && cpi->sf.rt_sf.use_temporal_noise_estimate && - (!cpi->use_svc || - (cpi->use_svc && + (!cpi->ppi->use_svc || + (cpi->ppi->use_svc && !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1))) update_zeromv_cnt(cpi, mbmi, mi_row, mi_col, bsize); @@ -590,7 +636,7 @@ static void pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data, RD_STATS *rd_cost, PARTITION_TYPE partition, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, RD_STATS best_rd) { - if (best_rd.rdcost < 0) { + if (cpi->sf.part_sf.use_best_rd_for_pruning && best_rd.rdcost < 0) { ctx->rd_stats.rdcost = INT64_MAX; ctx->rd_stats.skip_txfm = 0; av1_invalid_rd_stats(rd_cost); @@ -599,7 +645,8 @@ static void pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data, av1_set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize); - if (ctx->rd_mode_is_ready) { + if (cpi->sf.part_sf.reuse_prev_rd_results_for_part_ab && + ctx->rd_mode_is_ready) { assert(ctx->mic.bsize == bsize); assert(ctx->mic.partition == partition); rd_cost->rate = ctx->rd_stats.rate; @@ -672,6 +719,13 @@ static void pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data, av1_set_error_per_bit(&x->errorperbit, x->rdmult); 
av1_rd_cost_update(x->rdmult, &best_rd); + // If set best_rd.rdcost to INT64_MAX, the encoder will not use any previous + // rdcost information for the following mode search. + // Disabling the feature could get some coding gain, with encoder slowdown. + if (!cpi->sf.part_sf.use_best_rd_for_pruning) { + av1_invalid_rd_stats(&best_rd); + } + // Find best coding mode & reconstruct the MB so it is available // as a predictor for MBs that follow in the SB if (frame_is_intra_only(cm)) { @@ -750,11 +804,11 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td) { #if CONFIG_ENTROPY_STATS // delta quant applies to both intra and inter const int super_block_upper_left = - ((xd->mi_row & (cm->seq_params.mib_size - 1)) == 0) && - ((xd->mi_col & (cm->seq_params.mib_size - 1)) == 0); + ((xd->mi_row & (cm->seq_params->mib_size - 1)) == 0) && + ((xd->mi_col & (cm->seq_params->mib_size - 1)) == 0); const DeltaQInfo *const delta_q_info = &cm->delta_q_info; if (delta_q_info->delta_q_present_flag && - (bsize != cm->seq_params.sb_size || !mbmi->skip_txfm) && + (bsize != cm->seq_params->sb_size || !mbmi->skip_txfm) && super_block_upper_left) { const int dq = (mbmi->current_qindex - xd->current_base_qindex) / delta_q_info->delta_q_res; @@ -798,10 +852,16 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td) { } if (av1_allow_intrabc(cm)) { - update_cdf(fc->intrabc_cdf, is_intrabc_block(mbmi), 2); + const int is_intrabc = is_intrabc_block(mbmi); + update_cdf(fc->intrabc_cdf, is_intrabc, 2); #if CONFIG_ENTROPY_STATS - ++td->counts->intrabc[is_intrabc_block(mbmi)]; + ++td->counts->intrabc[is_intrabc]; #endif // CONFIG_ENTROPY_STATS + if (is_intrabc) { + const int_mv dv_ref = x->mbmi_ext_frame->ref_mv_stack[0].this_mv; + av1_update_mv_stats(&mbmi->mv[0].as_mv, &dv_ref.as_mv, &fc->ndvc, + MV_SUBPEL_NONE); + } } if (frame_is_intra_only(cm) || mbmi->skip_mode) return; @@ -947,7 +1007,7 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td) { 
} } - if (cm->seq_params.enable_interintra_compound && + if (cm->seq_params->enable_interintra_compound && is_interintra_allowed(mbmi)) { const int bsize_group = size_group_lookup[bsize]; if (mbmi->ref_frame[1] == INTRA_FRAME) { @@ -1008,7 +1068,7 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td) { mbmi->motion_mode == SIMPLE_TRANSLATION); const int masked_compound_used = is_any_masked_compound_used(bsize) && - cm->seq_params.enable_masked_compound; + cm->seq_params->enable_masked_compound; if (masked_compound_used) { const int comp_group_idx_ctx = get_comp_group_idx_context(xd); #if CONFIG_ENTROPY_STATS @@ -1053,7 +1113,7 @@ static void update_stats(const AV1_COMMON *const cm, ThreadData *td) { if (inter_block && cm->features.interp_filter == SWITCHABLE && mbmi->motion_mode != WARPED_CAUSAL && !is_nontrans_global_motion(xd, mbmi)) { - update_filter_type_cdf(xd, mbmi, cm->seq_params.enable_dual_filter); + update_filter_type_cdf(xd, mbmi, cm->seq_params->enable_dual_filter); } if (inter_block && !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { @@ -1160,8 +1220,8 @@ static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data, TileInfo *const tile = &tile_data->tile_info; MACROBLOCK *const x = &td->mb; MACROBLOCKD *xd = &x->e_mbd; - const int subsampling_x = cm->seq_params.subsampling_x; - const int subsampling_y = cm->seq_params.subsampling_y; + const int subsampling_x = cm->seq_params->subsampling_x; + const int subsampling_y = cm->seq_params->subsampling_y; av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); const int origin_mult = x->rdmult; @@ -1174,9 +1234,9 @@ static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data, set_cb_offsets(x->mbmi_ext_frame->cb_offset, x->cb_offset[PLANE_TYPE_Y], x->cb_offset[PLANE_TYPE_UV]); assert(x->cb_offset[PLANE_TYPE_Y] < - (1 << num_pels_log2_lookup[cpi->common.seq_params.sb_size])); + (1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size])); 
assert(x->cb_offset[PLANE_TYPE_UV] < - ((1 << num_pels_log2_lookup[cpi->common.seq_params.sb_size]) >> + ((1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]) >> (subsampling_x + subsampling_y))); } @@ -1184,7 +1244,7 @@ static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data, if (!dry_run) { update_cb_offsets(x, bsize, subsampling_x, subsampling_y); - if (bsize == cpi->common.seq_params.sb_size && mbmi->skip_txfm == 1 && + if (bsize == cpi->common.seq_params->sb_size && mbmi->skip_txfm == 1 && cm->delta_q_info.delta_lf_present_flag) { const int frame_lf_count = av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; @@ -1202,11 +1262,11 @@ static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data, // delta quant applies to both intra and inter const int super_block_upper_left = - ((mi_row & (cm->seq_params.mib_size - 1)) == 0) && - ((mi_col & (cm->seq_params.mib_size - 1)) == 0); + ((mi_row & (cm->seq_params->mib_size - 1)) == 0) && + ((mi_col & (cm->seq_params->mib_size - 1)) == 0); const DeltaQInfo *const delta_q_info = &cm->delta_q_info; if (delta_q_info->delta_q_present_flag && - (bsize != cm->seq_params.sb_size || !mbmi->skip_txfm) && + (bsize != cm->seq_params->sb_size || !mbmi->skip_txfm) && super_block_upper_left) { xd->current_base_qindex = mbmi->current_qindex; if (delta_q_info->delta_lf_present_flag) { @@ -1753,11 +1813,11 @@ void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, // We must have chosen a partitioning and encoding or we'll fail later on. // No other opportunities for success. 
- if (bsize == cm->seq_params.sb_size) + if (bsize == cm->seq_params->sb_size) assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX); if (do_recon) { - if (bsize == cm->seq_params.sb_size) { + if (bsize == cm->seq_params->sb_size) { // NOTE: To get estimate for rate due to the tokens, use: // int rate_coeffs = 0; // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS, @@ -1792,15 +1852,15 @@ static void encode_b_nonrd(const AV1_COMP *const cpi, TileDataEnc *tile_data, // Nonrd pickmode does not currently support second/combined reference. assert(!has_second_ref(mbmi)); av1_update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run); - const int subsampling_x = cpi->common.seq_params.subsampling_x; - const int subsampling_y = cpi->common.seq_params.subsampling_y; + const int subsampling_x = cpi->common.seq_params->subsampling_x; + const int subsampling_y = cpi->common.seq_params->subsampling_y; if (!dry_run) { set_cb_offsets(x->mbmi_ext_frame->cb_offset, x->cb_offset[PLANE_TYPE_Y], x->cb_offset[PLANE_TYPE_UV]); assert(x->cb_offset[PLANE_TYPE_Y] < - (1 << num_pels_log2_lookup[cpi->common.seq_params.sb_size])); + (1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size])); assert(x->cb_offset[PLANE_TYPE_UV] < - ((1 << num_pels_log2_lookup[cpi->common.seq_params.sb_size]) >> + ((1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]) >> (subsampling_x + subsampling_y))); } encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate); @@ -1808,6 +1868,8 @@ static void encode_b_nonrd(const AV1_COMP *const cpi, TileDataEnc *tile_data, update_cb_offsets(x, bsize, subsampling_x, subsampling_y); if (tile_data->allow_update_cdf) update_stats(&cpi->common, td); } + if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && mbmi->skip_txfm) + av1_cyclic_reset_segment_skip(cpi, x, mi_row, mi_col, bsize); // TODO(Ravi/Remya): Move this copy function to a better logical place // This function will copy the best mode information from block // level 
(x->mbmi_ext) to frame level (cpi->mbmi_ext_info.frame_base). This @@ -1889,8 +1951,8 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data, int i; wait_for_top_right_sb(&cpi->mt_info.enc_row_mt, &tile_data->row_mt_sync, - &tile_data->tile_info, cm->seq_params.sb_size, - cm->seq_params.mib_size_log2, bsize, mi_row, mi_col); + &tile_data->tile_info, cm->seq_params->sb_size, + cm->seq_params->mib_size_log2, bsize, mi_row, mi_col); #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, rd_pick_sb_modes_time); @@ -1947,6 +2009,30 @@ static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data, end_timing(cpi, av1_rd_pick_inter_mode_sb_time); #endif } + if (cpi->sf.rt_sf.skip_cdef_sb) { + // Find the corresponding 64x64 block. It'll be the 128x128 block if that's + // the block size. + const int mi_row_sb = mi_row - mi_row % MI_SIZE_64X64; + const int mi_col_sb = mi_col - mi_col % MI_SIZE_64X64; + MB_MODE_INFO **mi_sb = + cm->mi_params.mi_grid_base + + get_mi_grid_idx(&cm->mi_params, mi_row_sb, mi_col_sb); + // Do not skip if intra or new mv is picked. + const int skip = mi_sb[0]->skip_cdef_curr_sb && + !(mbmi->mode < INTRA_MODES || mbmi->mode == NEWMV); + // If 128x128 block is used, we need to set the flag for all 4 64x64 sub + // "blocks". + const int block64_in_sb = (bsize == BLOCK_128X128) ? 2 : 1; + for (int r = 0; r < block64_in_sb; ++r) { + for (int c = 0; c < block64_in_sb; ++c) { + const int idx_in_sb = + r * MI_SIZE_64X64 * cm->mi_params.mi_stride + c * MI_SIZE_64X64; + if (mi_sb[idx_in_sb]) mi_sb[idx_in_sb]->skip_cdef_curr_sb = skip; + } + } + // Store in the pickmode context. + ctx->mic.skip_cdef_curr_sb = mi_sb[0]->skip_cdef_curr_sb; + } x->rdmult = orig_rdmult; ctx->rd_stats.rate = rd_cost->rate; ctx->rd_stats.dist = rd_cost->dist; @@ -2301,15 +2387,15 @@ static bool rd_test_partition3(AV1_COMP *const cpi, ThreadData *td, // Loop over sub-partitions in AB partition type. 
for (int i = 0; i < SUB_PARTITIONS_AB; i++) { if (mode_cache && mode_cache[i]) { - x->use_intermode_cache = 1; - x->intermode_cache = mode_cache[i]; + x->use_mb_mode_cache = 1; + x->mb_mode_cache = mode_cache[i]; } const int mode_search_success = rd_try_subblock(cpi, td, tile_data, tp, i == SUB_PARTITIONS_AB - 1, ab_mi_pos[i][0], ab_mi_pos[i][1], ab_subsize[i], *best_rdc, &sum_rdc, partition, ctxs[i]); - x->use_intermode_cache = 0; - x->intermode_cache = NULL; + x->use_mb_mode_cache = 0; + x->mb_mode_cache = NULL; if (!mode_search_success) { return false; } @@ -2629,7 +2715,8 @@ static void rectangular_partition_search( TokenExtra **tp, MACROBLOCK *x, PC_TREE *pc_tree, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, PartitionSearchState *part_search_state, RD_STATS *best_rdc, - RD_RECT_PART_WIN_INFO *rect_part_win_info) { + RD_RECT_PART_WIN_INFO *rect_part_win_info, const RECT_PART_TYPE start_type, + const RECT_PART_TYPE end_type) { const AV1_COMMON *const cm = &cpi->common; PartitionBlkParams blk_params = part_search_state->part_blk_params; RD_STATS *sum_rdc = &part_search_state->sum_rdc; @@ -2663,7 +2750,7 @@ static void rectangular_partition_search( }; // Loop over rectangular partition types. 
- for (RECT_PART_TYPE i = HORZ; i < NUM_RECT_PARTS; i++) { + for (RECT_PART_TYPE i = start_type; i <= end_type; i++) { assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions, !part_search_state->partition_rect_allowed[i])); @@ -2879,7 +2966,8 @@ static void ab_partitions_search( TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx, PC_TREE *pc_tree, PartitionSearchState *part_search_state, RD_STATS *best_rdc, RD_RECT_PART_WIN_INFO *rect_part_win_info, - int pb_source_variance, int ext_partition_allowed) { + int pb_source_variance, int ext_partition_allowed, + const AB_PART_TYPE start_type, const AB_PART_TYPE end_type) { PartitionBlkParams blk_params = part_search_state->part_blk_params; const int mi_row = blk_params.mi_row; const int mi_col = blk_params.mi_col; @@ -2888,9 +2976,9 @@ static void ab_partitions_search( int ab_partitions_allowed[NUM_AB_PARTS] = { 1, 1, 1, 1 }; // Prune AB partitions av1_prune_ab_partitions( - cpi, x, pc_tree, bsize, pb_source_variance, best_rdc->rdcost, - part_search_state->rect_part_rd, part_search_state->split_rd, - rect_part_win_info, ext_partition_allowed, + cpi, x, pc_tree, bsize, mi_row, mi_col, pb_source_variance, + best_rdc->rdcost, part_search_state->rect_part_rd, + part_search_state->split_rd, rect_part_win_info, ext_partition_allowed, part_search_state->partition_rect_allowed[HORZ], part_search_state->partition_rect_allowed[VERT], &ab_partitions_allowed[HORZ_A], &ab_partitions_allowed[HORZ_B], @@ -2946,7 +3034,7 @@ static void ab_partitions_search( }; // Loop over AB partition types. 
- for (AB_PART_TYPE ab_part_type = 0; ab_part_type < NUM_AB_PARTS; + for (AB_PART_TYPE ab_part_type = start_type; ab_part_type <= end_type; ab_part_type++) { const PARTITION_TYPE part_type = ab_part_type + PARTITION_HORZ_A; @@ -2956,33 +3044,35 @@ static void ab_partitions_search( continue; blk_params.subsize = get_partition_subsize(bsize, part_type); - for (int i = 0; i < SUB_PARTITIONS_AB; i++) { - // Set AB partition context. - cur_part_ctxs[ab_part_type][i] = av1_alloc_pmc( - cpi, ab_subsize[ab_part_type][i], &td->shared_coeff_buf); - // Set mode as not ready. - cur_part_ctxs[ab_part_type][i]->rd_mode_is_ready = 0; - } + if (cpi->sf.part_sf.reuse_prev_rd_results_for_part_ab) { + for (int i = 0; i < SUB_PARTITIONS_AB; i++) { + // Set AB partition context. + cur_part_ctxs[ab_part_type][i] = av1_alloc_pmc( + cpi, ab_subsize[ab_part_type][i], &td->shared_coeff_buf); + // Set mode as not ready. + cur_part_ctxs[ab_part_type][i]->rd_mode_is_ready = 0; + } - // We can copy directly the mode search results if we have already searched - // the current block and the contexts match. - if (is_ctx_ready[ab_part_type][0]) { - av1_copy_tree_context(cur_part_ctxs[ab_part_type][0], - mode_srch_ctx[ab_part_type][0][0]); - cur_part_ctxs[ab_part_type][0]->mic.partition = part_type; - cur_part_ctxs[ab_part_type][0]->rd_mode_is_ready = 1; - if (is_ctx_ready[ab_part_type][1]) { - av1_copy_tree_context(cur_part_ctxs[ab_part_type][1], - mode_srch_ctx[ab_part_type][1][0]); - cur_part_ctxs[ab_part_type][1]->mic.partition = part_type; - cur_part_ctxs[ab_part_type][1]->rd_mode_is_ready = 1; + // We can copy directly the mode search results if we have already + // searched the current block and the contexts match. 
+ if (is_ctx_ready[ab_part_type][0]) { + av1_copy_tree_context(cur_part_ctxs[ab_part_type][0], + mode_srch_ctx[ab_part_type][0][0]); + cur_part_ctxs[ab_part_type][0]->mic.partition = part_type; + cur_part_ctxs[ab_part_type][0]->rd_mode_is_ready = 1; + if (is_ctx_ready[ab_part_type][1]) { + av1_copy_tree_context(cur_part_ctxs[ab_part_type][1], + mode_srch_ctx[ab_part_type][1][0]); + cur_part_ctxs[ab_part_type][1]->mic.partition = part_type; + cur_part_ctxs[ab_part_type][1]->rd_mode_is_ready = 1; + } } } // Even if the contexts don't match, we can still speed up by reusing the // previous prediction mode. const MB_MODE_INFO *mode_cache[3] = { NULL, NULL, NULL }; - if (cpi->sf.inter_sf.reuse_best_prediction_for_part_ab) { + if (cpi->sf.part_sf.reuse_best_prediction_for_part_ab) { set_mode_cache_for_partition_ab(mode_cache, pc_tree, ab_part_type); } @@ -3180,21 +3270,6 @@ static void prune_4_way_partition_search( part4_search_allowed); } -// Set PARTITION_NONE allowed flag. -static AOM_INLINE void set_part_none_allowed_flag( - AV1_COMP *const cpi, PartitionSearchState *part_search_state) { - PartitionBlkParams blk_params = part_search_state->part_blk_params; - if ((blk_params.width <= blk_params.min_partition_size_1d) && - blk_params.has_rows && blk_params.has_cols) - part_search_state->partition_none_allowed = 1; - assert(part_search_state->terminate_partition_search == 0); - - // Set PARTITION_NONE for screen content. - if (cpi->use_screen_content_tools) - part_search_state->partition_none_allowed = - blk_params.has_rows && blk_params.has_cols; -} - // Set params needed for PARTITION_NONE search. 
static void set_none_partition_params(const AV1_COMP *const cpi, ThreadData *td, MACROBLOCK *x, PC_TREE *pc_tree, @@ -3247,11 +3322,10 @@ static void prune_partitions_after_none(AV1_COMP *const cpi, MACROBLOCK *x, bsize <= cpi->sf.part_sf.use_square_partition_only_threshold && bsize > BLOCK_4X4 && cpi->sf.part_sf.ml_predict_breakout_level >= 1; if (use_ml_based_breakout) { - if (av1_ml_predict_breakout(cpi, bsize, x, this_rdc, *pb_source_variance, - xd->bd)) { - part_search_state->do_square_split = 0; - part_search_state->do_rectangular_split = 0; - } + av1_ml_predict_breakout(cpi, bsize, x, this_rdc, blk_params, + *pb_source_variance, xd->bd, + &part_search_state->do_square_split, + &part_search_state->do_rectangular_split); } // Adjust dist breakout threshold according to the partition size. @@ -3329,10 +3403,11 @@ static void prune_partitions_after_split( !part_search_state->terminate_partition_search) { av1_setup_src_planes(x, cpi->source, mi_row, mi_col, av1_num_planes(cm), bsize); - av1_ml_prune_rect_partition( - cpi, x, bsize, best_rdc->rdcost, part_search_state->none_rd, - part_search_state->split_rd, &part_search_state->prune_rect_part[HORZ], - &part_search_state->prune_rect_part[VERT]); + av1_ml_prune_rect_partition(cpi, x, bsize, mi_row, mi_col, best_rdc->rdcost, + part_search_state->none_rd, + part_search_state->split_rd, + &part_search_state->prune_rect_part[HORZ], + &part_search_state->prune_rect_part[VERT]); } } @@ -3351,12 +3426,11 @@ static void none_partition_search( const BLOCK_SIZE bsize = blk_params.bsize; assert(bsize < BLOCK_SIZES_ALL); - // Set PARTITION_NONE allowed flag. - set_part_none_allowed_flag(cpi, part_search_state); if (!part_search_state->partition_none_allowed) return; int pt_cost = 0; RD_STATS best_remain_rdcost; + av1_invalid_rd_stats(&best_remain_rdcost); // Set PARTITION_NONE context and cost. 
set_none_partition_params(cpi, td, x, pc_tree, part_search_state, @@ -3402,7 +3476,7 @@ static void none_partition_search( if (cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions) { const int ref_type = av1_ref_frame_type(pc_tree->none->mic.ref_frame); av1_update_picked_ref_frames_mask( - x, ref_type, bsize, cm->seq_params.mib_size, mi_row, mi_col); + x, ref_type, bsize, cm->seq_params->mib_size, mi_row, mi_col); } // Calculate the total cost and update the best partition. @@ -3553,6 +3627,376 @@ static void split_partition_search( av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm)); } +// The max number of nodes in the partition tree. +// The number of leaf nodes is (128x128) / (4x4) = 1024. +// The number of All possible parent nodes is 1 + 2 + ... + 512 = 1023. +#define NUM_NODES 2048 + +static void write_partition_tree(AV1_COMP *const cpi, + const PC_TREE *const pc_tree, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col) { + (void)mi_row; + (void)mi_col; + const char *path = cpi->oxcf.partition_info_path; + char filename[256]; + snprintf(filename, sizeof(filename), "%s/partition_tree_sb%d_c%d", path, + cpi->sb_counter, 0); + ++cpi->sb_counter; + FILE *pfile = fopen(filename, "w"); + fprintf(pfile, "%d", bsize); + + // Write partition type with BFS order. + const PC_TREE *tree_node_queue[NUM_NODES] = { NULL }; + int q_idx = 0; + int depth = 0; + int last_idx = 1; + int num_nodes = 1; + + // First traversal to get number of leaf nodes and depth. + tree_node_queue[q_idx] = pc_tree; + while (num_nodes > 0) { + const PC_TREE *node = tree_node_queue[q_idx]; + if (node->partitioning == PARTITION_SPLIT) { + for (int i = 0; i < 4; ++i) { + tree_node_queue[last_idx] = node->split[i]; + ++last_idx; + } + ++depth; + num_nodes += 4; + } + --num_nodes; + ++q_idx; + } + const int num_leafs = last_idx; + fprintf(pfile, ",%d,%d", num_leafs, /*num_configs=*/1); + + // Write partitions for each node. 
+ q_idx = 0; + depth = 0; + last_idx = 1; + num_nodes = 1; + tree_node_queue[q_idx] = pc_tree; + while (num_nodes > 0) { + const PC_TREE *node = tree_node_queue[q_idx]; + fprintf(pfile, ",%d", node->partitioning); + if (node->partitioning == PARTITION_SPLIT) { + for (int i = 0; i < 4; ++i) { + tree_node_queue[last_idx] = node->split[i]; + ++last_idx; + } + ++depth; + num_nodes += 4; + } + --num_nodes; + ++q_idx; + } + fprintf(pfile, "\n"); + + fclose(pfile); +} + +static void verify_write_partition_tree(const AV1_COMP *const cpi, + const PC_TREE *const pc_tree, + const BLOCK_SIZE bsize, + const int config_id, const int mi_row, + const int mi_col) { + (void)mi_row; + (void)mi_col; + const char *path = cpi->oxcf.partition_info_path; + char filename[256]; + snprintf(filename, sizeof(filename), "%s/verify_partition_tree_sb%d_c%d", + path, cpi->sb_counter, config_id); + FILE *pfile = fopen(filename, "w"); + fprintf(pfile, "%d", bsize); + + // Write partition type with BFS order. + const PC_TREE *tree_node_queue[NUM_NODES] = { NULL }; + int q_idx = 0; + int depth = 0; + int last_idx = 1; + int num_nodes = 1; + + // First traversal to get number of leaf nodes and depth. + tree_node_queue[q_idx] = pc_tree; + while (num_nodes > 0) { + const PC_TREE *node = tree_node_queue[q_idx]; + if (node != NULL && node->partitioning == PARTITION_SPLIT) { + for (int i = 0; i < 4; ++i) { + tree_node_queue[last_idx] = node->split[i]; + ++last_idx; + } + ++depth; + num_nodes += 4; + } + --num_nodes; + ++q_idx; + } + const int num_leafs = last_idx; + fprintf(pfile, ",%d,%d", num_leafs, /*num_configs=*/1); + + // Write partitions for each node. 
+ q_idx = 0; + depth = 0; + last_idx = 1; + num_nodes = 1; + tree_node_queue[q_idx] = pc_tree; + while (num_nodes > 0) { + const PC_TREE *node = tree_node_queue[q_idx]; + if (node != NULL) { // suppress warning + fprintf(pfile, ",%d", node->partitioning); + if (node->partitioning == PARTITION_SPLIT) { + for (int i = 0; i < 4; ++i) { + tree_node_queue[last_idx] = node->split[i]; + ++last_idx; + } + ++depth; + num_nodes += 4; + } + } + --num_nodes; + ++q_idx; + } + fprintf(pfile, "\n"); + + fclose(pfile); +} + +static int read_partition_tree(AV1_COMP *const cpi, PC_TREE *const pc_tree, + const int config_id) { + const char *path = cpi->oxcf.partition_info_path; + char filename[256]; + snprintf(filename, sizeof(filename), "%s/partition_tree_sb%d_c%d", path, + cpi->sb_counter, config_id); + FILE *pfile = fopen(filename, "r"); + if (pfile == NULL) { + printf("Can't find the file: %s\n", filename); + exit(0); + } + + int read_bsize; + int num_nodes; + int num_configs; + fscanf(pfile, "%d,%d,%d", &read_bsize, &num_nodes, &num_configs); + assert(read_bsize == cpi->common.seq_params->sb_size); + BLOCK_SIZE bsize = (BLOCK_SIZE)read_bsize; + + PC_TREE *tree_node_queue[NUM_NODES] = { NULL }; + int last_idx = 1; + int q_idx = 0; + tree_node_queue[q_idx] = pc_tree; + while (num_nodes > 0) { + int partitioning; + fscanf(pfile, ",%d", &partitioning); + assert(partitioning >= PARTITION_NONE && + partitioning < EXT_PARTITION_TYPES); + PC_TREE *node = tree_node_queue[q_idx]; + if (node != NULL) node->partitioning = partitioning; + if (partitioning == PARTITION_SPLIT) { + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + for (int i = 0; i < 4; ++i) { + if (node != NULL) { // Suppress warning + node->split[i] = av1_alloc_pc_tree_node(subsize); + node->split[i]->index = i; + tree_node_queue[last_idx] = node->split[i]; + ++last_idx; + } + } + bsize = subsize; + } + --num_nodes; + ++q_idx; + } + fclose(pfile); + + return num_configs; +} + +static RD_STATS 
rd_search_for_fixed_partition( + AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, + TokenExtra **tp, SIMPLE_MOTION_DATA_TREE *sms_tree, int mi_row, int mi_col, + const BLOCK_SIZE bsize, PC_TREE *pc_tree) { + const PARTITION_TYPE partition = pc_tree->partitioning; + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + TileInfo *const tile_info = &tile_data->tile_info; + RD_STATS best_rdc; + av1_invalid_rd_stats(&best_rdc); + int sum_subblock_rate = 0; + int64_t sum_subblock_dist = 0; + PartitionSearchState part_search_state; + init_partition_search_state_params(x, cpi, &part_search_state, mi_row, mi_col, + bsize); + // Override partition costs at the edges of the frame in the same + // way as in read_partition (see decodeframe.c). + PartitionBlkParams blk_params = part_search_state.part_blk_params; + if (!(blk_params.has_rows && blk_params.has_cols)) + set_partition_cost_for_edge_blk(cm, &part_search_state); + + av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + + // Save rdmult before it might be changed, so it can be restored later. + const int orig_rdmult = x->rdmult; + setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL); + (void)orig_rdmult; + + // Set the context. 
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + + assert(bsize < BLOCK_SIZES_ALL); + unsigned int pb_source_variance = UINT_MAX; + int64_t part_none_rd = INT64_MAX; + int64_t none_rd = INT64_MAX; + int inc_step[NUM_PART4_TYPES] = { 0 }; + if (partition == PARTITION_HORZ_4) inc_step[HORZ4] = mi_size_high[bsize] / 4; + if (partition == PARTITION_VERT_4) inc_step[VERT4] = mi_size_wide[bsize] / 4; + + switch (partition) { + case PARTITION_NONE: + none_partition_search(cpi, td, tile_data, x, pc_tree, sms_tree, &x_ctx, + &part_search_state, &best_rdc, &pb_source_variance, + &none_rd, &part_none_rd); + break; + case PARTITION_HORZ: + rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx, + &part_search_state, &best_rdc, NULL, HORZ, + HORZ); + break; + case PARTITION_VERT: + rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx, + &part_search_state, &best_rdc, NULL, VERT, + VERT); + break; + case PARTITION_HORZ_A: + ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + &part_search_state, &best_rdc, NULL, + pb_source_variance, 1, HORZ_A, HORZ_A); + break; + case PARTITION_HORZ_B: + ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + &part_search_state, &best_rdc, NULL, + pb_source_variance, 1, HORZ_B, HORZ_B); + break; + case PARTITION_VERT_A: + ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + &part_search_state, &best_rdc, NULL, + pb_source_variance, 1, VERT_A, VERT_A); + break; + case PARTITION_VERT_B: + ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + &part_search_state, &best_rdc, NULL, + pb_source_variance, 1, VERT_B, VERT_B); + break; + case PARTITION_HORZ_4: + rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + pc_tree->horizontal4, 
&part_search_state, &best_rdc, + inc_step, PARTITION_HORZ_4); + break; + case PARTITION_VERT_4: + rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, + pc_tree->vertical4, &part_search_state, &best_rdc, + inc_step, PARTITION_VERT_4); + break; + case PARTITION_SPLIT: + for (int idx = 0; idx < SUB_PARTITIONS_SPLIT; ++idx) { + const BLOCK_SIZE subsize = + get_partition_subsize(bsize, PARTITION_SPLIT); + assert(subsize < BLOCK_SIZES_ALL); + const int next_mi_row = + idx < 2 ? mi_row : mi_row + mi_size_high[subsize]; + const int next_mi_col = + idx % 2 == 0 ? mi_col : mi_col + mi_size_wide[subsize]; + if (next_mi_row >= cm->mi_params.mi_rows || + next_mi_col >= cm->mi_params.mi_cols) { + continue; + } + const RD_STATS subblock_rdc = rd_search_for_fixed_partition( + cpi, td, tile_data, tp, sms_tree->split[idx], next_mi_row, + next_mi_col, subsize, pc_tree->split[idx]); + sum_subblock_rate += subblock_rdc.rate; + sum_subblock_dist += subblock_rdc.dist; + } + best_rdc.rate = sum_subblock_rate; + best_rdc.rate += part_search_state.partition_cost[PARTITION_SPLIT]; + best_rdc.dist = sum_subblock_dist; + best_rdc.rdcost = RDCOST(x->rdmult, best_rdc.rate, best_rdc.dist); + break; + default: assert(0 && "invalid partition type."); exit(0); + } + // Note: it is necessary to restore context information. 
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + + if (bsize != cm->seq_params->sb_size) { + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + pc_tree, NULL); + } + x->rdmult = orig_rdmult; + + return best_rdc; +} + +bool av1_rd_partition_search(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, + SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row, + int mi_col, const BLOCK_SIZE bsize, + RD_STATS *best_rd_cost) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + int best_idx = 0; + int64_t min_rdcost = INT64_MAX; + int num_configs; + RD_STATS *rdcost = NULL; + int i = 0; + do { + PC_TREE *const pc_tree = av1_alloc_pc_tree_node(bsize); + num_configs = read_partition_tree(cpi, pc_tree, i); + if (i == 0) { + rdcost = aom_calloc(num_configs, sizeof(*rdcost)); + } + if (num_configs <= 0) { + av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0); + if (rdcost != NULL) aom_free(rdcost); + exit(0); + return false; + } + verify_write_partition_tree(cpi, pc_tree, bsize, i, mi_row, mi_col); + // Encode the block with the given partition tree. Get rdcost and encoding + // time. + rdcost[i] = rd_search_for_fixed_partition(cpi, td, tile_data, tp, sms_root, + mi_row, mi_col, bsize, pc_tree); + + if (rdcost[i].rdcost < min_rdcost) { + min_rdcost = rdcost[i].rdcost; + best_idx = i; + *best_rd_cost = rdcost[i]; + } + av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0); + ++i; + } while (i < num_configs); + + // Encode with the partition configuration with the smallest rdcost. 
+ PC_TREE *const pc_tree = av1_alloc_pc_tree_node(bsize); + read_partition_tree(cpi, pc_tree, best_idx); + rd_search_for_fixed_partition(cpi, td, tile_data, tp, sms_root, mi_row, + mi_col, bsize, pc_tree); + set_cb_offsets(x->cb_offset, 0, 0); + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, + pc_tree, NULL); + + av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0); + aom_free(rdcost); + ++cpi->sb_counter; + + return true; +} + /*!\brief AV1 block partition search (full search). * * \ingroup partition_search @@ -3617,7 +4061,7 @@ bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td, av1_invalid_rd_stats(rd_cost); return part_search_state.found_best_partition; } - if (bsize == cm->seq_params.sb_size) x->must_find_valid_partition = 0; + if (bsize == cm->seq_params->sb_size) x->must_find_valid_partition = 0; // Override skipping rectangular partition operations for edge blocks. if (none_rd) *none_rd = 0; @@ -3742,7 +4186,7 @@ BEGIN_PARTITION_SEARCH: // when NONE and SPLIT partition rd_costs are INT64_MAX. if (cpi->sf.part_sf.early_term_after_none_split && part_none_rd == INT64_MAX && part_split_rd == INT64_MAX && - !x->must_find_valid_partition && (bsize != cm->seq_params.sb_size)) { + !x->must_find_valid_partition && (bsize != cm->seq_params->sb_size)) { part_search_state.terminate_partition_search = 1; } @@ -3755,7 +4199,7 @@ BEGIN_PARTITION_SEARCH: // Rectangular partitions search stage. rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx, &part_search_state, &best_rdc, - rect_part_win_info); + rect_part_win_info, HORZ, VERT); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, rectangular_partition_search_time); #endif @@ -3784,7 +4228,8 @@ BEGIN_PARTITION_SEARCH: // AB partitions search stage. 
ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree, &part_search_state, &best_rdc, rect_part_win_info, - pb_source_variance, ext_partition_allowed); + pb_source_variance, ext_partition_allowed, HORZ_A, + VERT_B); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, ab_partitions_search_time); #endif @@ -3832,7 +4277,7 @@ BEGIN_PARTITION_SEARCH: end_timing(cpi, rd_pick_4partition_time); #endif - if (bsize == cm->seq_params.sb_size && + if (bsize == cm->seq_params->sb_size && !part_search_state.found_best_partition) { // Did not find a valid partition, go back and search again, with less // constraint on which partition types to search. @@ -3859,7 +4304,7 @@ BEGIN_PARTITION_SEARCH: // prediction block. print_partition_timing_stats_with_rdcost( part_timing_stats, mi_row, mi_col, bsize, - cpi->gf_group.update_type[cpi->gf_group.index], + cpi->ppi->gf_group.update_type[cpi->gf_frame_index], cm->current_frame.frame_number, &best_rdc, "part_timing.csv"); /* print_partition_timing_stats(part_timing_stats, cm->show_frame, @@ -3881,11 +4326,14 @@ BEGIN_PARTITION_SEARCH: // If a valid partition is found and reconstruction is required for future // sub-blocks in the same group. if (part_search_state.found_best_partition && pc_tree->index != 3) { - if (bsize == cm->seq_params.sb_size) { + if (bsize == cm->seq_params->sb_size) { // Encode the superblock. const int emit_output = multi_pass_mode != SB_DRY_PASS; const RUN_TYPE run_type = emit_output ? OUTPUT_ENABLED : DRY_RUN_NORMAL; + // Write partition tree to file. Not used by default. 
+ if (0) write_partition_tree(cpi, pc_tree, bsize, mi_row, mi_col); + set_cb_offsets(x->cb_offset, 0, 0); encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, run_type, bsize, pc_tree, NULL); @@ -3907,7 +4355,7 @@ BEGIN_PARTITION_SEARCH: if (pc_tree_dealloc == 0) av1_free_pc_tree_recursive(pc_tree, num_planes, 1, 1); - if (bsize == cm->seq_params.sb_size) { + if (bsize == cm->seq_params->sb_size) { assert(best_rdc.rate < INT_MAX); assert(best_rdc.dist < INT64_MAX); } else { @@ -3958,7 +4406,7 @@ static int ml_predict_var_paritioning(AV1_COMP *cpi, MACROBLOCK *x, const float thresh = cpi->oxcf.speed <= 5 ? 1.25f : 0.0f; float features[FEATURES] = { 0.0f }; const int dc_q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); int feature_idx = 0; float score[LABELS]; @@ -4038,7 +4486,7 @@ static int store_partition_data(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, { const int dc_q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); int feature_idx = 0; features[feature_idx++] = logf((float)(dc_q * dc_q) / 256.0f + 1.0f); @@ -4186,7 +4634,7 @@ void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td, int partition_none_allowed = !force_horz_split && !force_vert_split; assert(mi_size_wide[bsize] == mi_size_high[bsize]); // Square partition only - assert(cm->seq_params.sb_size == BLOCK_64X64); // Small SB so far + assert(cm->seq_params->sb_size == BLOCK_64X64); // Small SB so far (void)*tp_orig; @@ -4293,7 +4741,7 @@ void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td, fill_mode_info_sb(cpi, x, mi_row, mi_col, bsize, pc_tree); if (do_recon) { - if (bsize == cm->seq_params.sb_size) { + if (bsize == cm->seq_params->sb_size) { // NOTE: To get estimate for rate due to the tokens, use: // int rate_coeffs = 0; // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS, diff --git 
a/third_party/libaom/source/libaom/av1/encoder/partition_search.h b/third_party/libaom/source/libaom/av1/encoder/partition_search.h index 136548e3e6..8a6717690c 100644 --- a/third_party/libaom/source/libaom/av1/encoder/partition_search.h +++ b/third_party/libaom/source/libaom/av1/encoder/partition_search.h @@ -39,6 +39,13 @@ void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td, RD_STATS *rd_cost, int do_recon, int64_t best_rd, PC_TREE *pc_tree); #endif +void av1_reset_part_sf(PARTITION_SPEED_FEATURES *part_sf); + +bool av1_rd_partition_search(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TokenExtra **tp, + SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row, + int mi_col, BLOCK_SIZE bsize, + RD_STATS *best_rd_cost); bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TokenExtra **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost, @@ -57,12 +64,14 @@ static AOM_INLINE void set_cb_offsets(uint16_t *cb_offset, static AOM_INLINE void update_cb_offsets(MACROBLOCK *x, const BLOCK_SIZE bsize, const int subsampling_x, const int subsampling_y) { - const BLOCK_SIZE plane_bsize = - get_plane_block_size(bsize, subsampling_x, subsampling_y); x->cb_offset[PLANE_TYPE_Y] += block_size_wide[bsize] * block_size_high[bsize]; - if (x->e_mbd.is_chroma_ref) + if (x->e_mbd.is_chroma_ref) { + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, subsampling_x, subsampling_y); + assert(plane_bsize != BLOCK_INVALID); x->cb_offset[PLANE_TYPE_UV] += block_size_wide[plane_bsize] * block_size_high[plane_bsize]; + } } #endif // AOM_AV1_ENCODER_PARTITION_SEARCH_H_ diff --git a/third_party/libaom/source/libaom/av1/encoder/partition_strategy.c b/third_party/libaom/source/libaom/av1/encoder/partition_strategy.c index f846d595bc..bf678a452f 100644 --- a/third_party/libaom/source/libaom/av1/encoder/partition_strategy.c +++ b/third_party/libaom/source/libaom/av1/encoder/partition_strategy.c @@ -35,6 +35,48 @@ static AOM_INLINE 
void simple_motion_search_prune_part_features( int mi_row, int mi_col, BLOCK_SIZE bsize, float *features, int features_to_get); +static bool ext_ml_model_decision_before_none( + AV1_COMP *cpi, const float features_from_motion[FEATURE_SIZE_SMS_SPLIT], + int *partition_none_allowed, int *partition_horz_allowed, + int *partition_vert_allowed, int *do_rectangular_split, + int *do_square_split); + +static bool ext_ml_model_decision_before_none_part2( + AV1_COMP *cpi, + const float features_from_motion[FEATURE_SIZE_SMS_PRUNE_PART], + int *prune_horz, int *prune_vert); + +static bool ext_ml_model_decision_after_none( + ExtPartController *const ext_part_controller, const int is_intra_frame, + const float *const features_after_none, int *do_square_split, + int *do_rectangular_split); + +static bool ext_ml_model_decision_after_none_part2( + AV1_COMP *const cpi, const float *const features_terminate, + int *terminate_partition_search); + +static bool ext_ml_model_decision_after_split( + AV1_COMP *const cpi, const float *const features_terminate, + int *terminate_partition_search); + +static bool ext_ml_model_decision_after_split_part2( + ExtPartController *const ext_part_controller, const int is_intra_frame, + const float *const features_prune, int *prune_rect_part_horz, + int *prune_rect_part_vert); + +static bool ext_ml_model_decision_after_rect( + ExtPartController *const ext_part_controller, const int is_intra_frame, + const float *const features_after_rect, int *horza_partition_allowed, + int *horzb_partition_allowed, int *verta_partition_allowed, + int *vertb_partition_allowed); + +static bool ext_ml_model_decision_after_part_ab( + AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int part_ctx, + int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], + int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed, + int *const partition_vert4_allowed, unsigned int pb_source_variance, + int mi_row, int mi_col); + static INLINE 
int convert_bsize_to_idx(BLOCK_SIZE bsize) { switch (bsize) { case BLOCK_128X128: return 0; @@ -45,9 +87,45 @@ static INLINE int convert_bsize_to_idx(BLOCK_SIZE bsize) { default: assert(0 && "Invalid bsize"); return -1; } } -#endif -#if !CONFIG_REALTIME_ONLY +static char *get_feature_file_name(int id) { + static char *feature_file_names[] = { + "feature_before_partition_none", + "feature_before_partition_none_prune_rect", + "feature_after_partition_none_prune", + "feature_after_partition_none_terminate", + "feature_after_partition_split_terminate", + "feature_after_partition_split_prune_rect", + "feature_after_partition_rect", + "feature_after_partition_ab", + }; + + return feature_file_names[id]; +} + +static void write_features_to_file(const char *const path, + const bool is_test_mode, + const float *features, + const int feature_size, const int id, + const int bsize, const int mi_row, + const int mi_col) { + if (!WRITE_FEATURE_TO_FILE && !is_test_mode) return; + + char filename[256]; + snprintf(filename, sizeof(filename), "%s/%s", path, + get_feature_file_name(id)); + FILE *pfile = fopen(filename, "a"); + if (!is_test_mode) { + fprintf(pfile, "%d,%d,%d,%d,%d\n", id, bsize, mi_row, mi_col, feature_size); + } + for (int i = 0; i < feature_size; ++i) { + fprintf(pfile, "%.6f", features[i]); + if (i < feature_size - 1) fprintf(pfile, ","); + } + fprintf(pfile, "\n"); + fclose(pfile); +} + // TODO(chiyotsai@google.com): This is very much a work in progress. 
We still // need to the following: // -- add support for hdres @@ -61,7 +139,7 @@ void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x, int *partition_vert_allowed, int *do_rectangular_split, int *do_square_split) { - assert(cm->seq_params.sb_size >= BLOCK_64X64 && + assert(cm->seq_params->sb_size >= BLOCK_64X64 && "Invalid sb_size for intra_cnn!"); const int bsize_idx = convert_bsize_to_idx(bsize); @@ -284,6 +362,20 @@ void av1_simple_motion_search_based_split( simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col, bsize, features, FEATURE_SMS_SPLIT_MODEL_FLAG); + + // Write features to file + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, + FEATURE_SIZE_SMS_SPLIT, 0, bsize, mi_row, mi_col); + + // Note: it is intended to not normalize the features here, to keep it + // consistent for all features collected and passed to the external model. + if (ext_ml_model_decision_before_none( + cpi, features, partition_none_allowed, partition_horz_allowed, + partition_vert_allowed, do_rectangular_split, do_square_split)) { + return; + } + for (int idx = 0; idx < FEATURE_SIZE_SMS_SPLIT; idx++) { features[idx] = (features[idx] - ml_mean[idx]) / ml_std[idx]; } @@ -308,7 +400,7 @@ void av1_simple_motion_search_based_split( // If the score is very low, prune rectangular split since it is unlikely to // occur. if (cpi->sf.part_sf.simple_motion_search_rect_split) { - const float scale = res_idx >= 2 ? 3 : 2; + const float scale = res_idx >= 2 ? 
3.0f : 2.0f; const float rect_split_thresh = scale * av1_simple_motion_search_no_split_thresh [cpi->sf.part_sf.simple_motion_search_rect_split][res_idx] @@ -356,7 +448,7 @@ static int simple_motion_search_get_best_ref( int_mv best_mv = av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref, start_mvs[ref], num_planes, use_subpixel); - curr_var = cpi->fn_ptr[bsize].vf( + curr_var = cpi->ppi->fn_ptr[bsize].vf( x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf, xd->plane[0].dst.stride, &curr_sse); if (curr_sse < *best_sse) { @@ -543,6 +635,24 @@ void av1_simple_motion_search_prune_rect( simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col, bsize, features, FEATURE_SMS_PRUNE_PART_FLAG); + + // Note: it is intended to not normalize the features here, to keep it + // consistent for all features collected and passed to the external model. + if (cpi->sf.part_sf.simple_motion_search_prune_rect && + !frame_is_intra_only(cm) && + (partition_horz_allowed || partition_vert_allowed) && + bsize >= BLOCK_8X8 && !av1_superres_scaled(cm)) { + // Write features to file + write_features_to_file( + cpi->oxcf.partition_info_path, cpi->ext_part_controller.test_mode, + features, FEATURE_SIZE_SMS_PRUNE_PART, 1, bsize, mi_row, mi_col); + + if (ext_ml_model_decision_before_none_part2(cpi, features, prune_horz, + prune_vert)) { + return; + } + } + for (int f_idx = 0; f_idx < FEATURE_SIZE_SMS_PRUNE_PART; f_idx++) { features[f_idx] = (features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx]; } @@ -617,6 +727,15 @@ void av1_simple_motion_search_early_term_none( assert(0 && "Unexpected block size in simple_motion_term_none"); } + // Write features to file + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, + FEATURE_SIZE_SMS_TERM_NONE, 3, bsize, mi_row, mi_col); + + if (ext_ml_model_decision_after_none_part2(cpi, features, early_terminate)) { + return; + } + if (ml_model) { float score = 0.0f; for (f_idx = 0; 
f_idx < FEATURE_SIZE_SMS_TERM_NONE; f_idx++) { @@ -636,8 +755,9 @@ void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x, float *features) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; - const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + // Currently this only allows 128X128 SB size. May extend it to 64X64 SB size. assert(sb_size == BLOCK_128X128); int f_idx = 0; @@ -701,14 +821,18 @@ void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x, if (log_sse > max_log_sse) max_log_sse = log_sse; } aom_clear_system_state(); - const float avg_mv_row = sum_mv_row / 64.0f; - const float var_mv_row = sum_mv_row_sq / 64.0f - avg_mv_row * avg_mv_row; + const int blks = mb_rows * mb_cols; + const float avg_mv_row = sum_mv_row / (float)blks; + const float var_mv_row = + sum_mv_row_sq / (float)blks - avg_mv_row * avg_mv_row; - const float avg_mv_col = sum_mv_col / 64.0f; - const float var_mv_col = sum_mv_col_sq / 64.0f - avg_mv_col * avg_mv_col; + const float avg_mv_col = sum_mv_col / (float)blks; + const float var_mv_col = + sum_mv_col_sq / (float)blks - avg_mv_col * avg_mv_col; - const float avg_log_sse = sum_log_sse / 64.0f; - const float var_log_sse = sum_log_sse_sq / 64.0f - avg_log_sse * avg_log_sse; + const float avg_log_sse = sum_log_sse / (float)blks; + const float var_log_sse = + sum_log_sse_sq / (float)blks - avg_log_sse * avg_log_sse; features[f_idx++] = avg_log_sse; features[f_idx++] = avg_mv_col; @@ -727,11 +851,20 @@ void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x, assert(f_idx == FEATURE_SIZE_MAX_MIN_PART_PRED); } +// Convert result index to block size. 
+// result idx block size +// 0 BLOCK_16X16 +// 1 BLOCK_32X32 +// 2 BLOCK_64X64 +// 3 BLOCK_128X128 +static BLOCK_SIZE get_block_size(int idx) { + return (BLOCK_SIZE)((idx + 2) * 3); +} + BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi, const MACROBLOCK *const x, const float *features) { - float scores[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f }, - probs[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f }; + float scores[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f }; const NN_CONFIG *nn_config = &av1_max_part_pred_nn_config; assert(cpi->sf.part_sf.auto_max_partition_based_on_simple_motion != @@ -739,21 +872,26 @@ BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi, aom_clear_system_state(); av1_nn_predict(features, nn_config, 1, scores); - av1_nn_softmax(scores, probs, MAX_NUM_CLASSES_MAX_MIN_PART_PRED); int result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion == DIRECT_PRED) { result = 0; - float max_prob = probs[0]; + float max_score = scores[0]; for (int i = 1; i < MAX_NUM_CLASSES_MAX_MIN_PART_PRED; ++i) { - if (probs[i] > max_prob) { - max_prob = probs[i]; + if (scores[i] > max_score) { + max_score = scores[i]; result = i; } } - } else if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion == - RELAXED_PRED) { + return get_block_size(result); + } + + float probs[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f }; + av1_nn_softmax(scores, probs, MAX_NUM_CLASSES_MAX_MIN_PART_PRED); + + if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion == + RELAXED_PRED) { for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0; --result) { if (result < MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1) { @@ -763,7 +901,7 @@ BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi, } } else if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion == ADAPT_PRED) { - const BLOCK_SIZE sb_size = cpi->common.seq_params.sb_size; + const BLOCK_SIZE sb_size = 
cpi->common.seq_params->sb_size; const MACROBLOCKD *const xd = &x->e_mbd; // TODO(debargha): x->source_variance is unavailable at this point, // so compute. The redundant recomputation later can be removed. @@ -784,7 +922,7 @@ BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi, } } - return (BLOCK_SIZE)((result + 2) * 3); + return get_block_size(result); } // Get the minimum partition block width and height(in log scale) under a @@ -911,6 +1049,16 @@ void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x, assert(f_idx == FEATURES); + // Write features to file + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, FEATURES, + 4, bsize, mi_row, mi_col); + + if (ext_ml_model_decision_after_split(cpi, features, + terminate_partition_search)) { + return; + } + float score = 0.0f; av1_nn_predict(features, nn_config, 1, &score); // Score is indicator of confidence that we should NOT terminate. @@ -918,10 +1066,11 @@ void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x, } #undef FEATURES -void av1_ml_prune_rect_partition(const AV1_COMP *const cpi, - const MACROBLOCK *const x, BLOCK_SIZE bsize, - int64_t best_rd, int64_t none_rd, - int64_t *split_rd, int *const dst_prune_horz, +void av1_ml_prune_rect_partition(AV1_COMP *const cpi, const MACROBLOCK *const x, + BLOCK_SIZE bsize, const int mi_row, + const int mi_col, int64_t best_rd, + int64_t none_rd, int64_t *split_rd, + int *const dst_prune_horz, int *const dst_prune_vert) { if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return; best_rd = AOMMAX(best_rd, 1); @@ -998,6 +1147,17 @@ void av1_ml_prune_rect_partition(const AV1_COMP *const cpi, for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) features[5 + i] = (float)split_variance[i] / (float)whole_block_variance; + // Write features to file + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, + /*feature_size=*/9, 5, bsize, 
mi_row, mi_col); + + if (ext_ml_model_decision_after_split_part2( + &cpi->ext_part_controller, frame_is_intra_only(&cpi->common), + features, dst_prune_horz, dst_prune_vert)) { + return; + } + // 2. Do the prediction and prune 0-2 partitions based on their probabilities float raw_scores[3] = { 0.0f }; av1_nn_predict(features, nn_config, 1, raw_scores); @@ -1014,7 +1174,8 @@ void av1_ml_prune_rect_partition(const AV1_COMP *const cpi, // Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be // considered. void av1_ml_prune_ab_partition( - BLOCK_SIZE bsize, int part_ctx, int var_ctx, int64_t best_rd, + AV1_COMP *const cpi, BLOCK_SIZE bsize, const int mi_row, const int mi_col, + int part_ctx, int var_ctx, int64_t best_rd, int64_t horz_rd[SUB_PARTITIONS_RECT], int64_t vert_rd[SUB_PARTITIONS_RECT], int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const horza_partition_allowed, int *const horzb_partition_allowed, int *const verta_partition_allowed, @@ -1065,6 +1226,20 @@ void av1_ml_prune_ab_partition( } assert(feature_index == 10); + // Write features to file + if (!frame_is_intra_only(&cpi->common)) { + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, + /*feature_size=*/10, 6, bsize, mi_row, mi_col); + } + + if (ext_ml_model_decision_after_rect( + &cpi->ext_part_controller, frame_is_intra_only(&cpi->common), + features, horza_partition_allowed, horzb_partition_allowed, + verta_partition_allowed, vertb_partition_allowed)) { + return; + } + // Calculate scores using the NN model. float score[16] = { 0.0f }; av1_nn_predict(features, nn_config, 1, score); @@ -1101,12 +1276,17 @@ void av1_ml_prune_ab_partition( #define LABELS 4 // Use a ML model to predict if horz4 and vert4 should be considered. 
void av1_ml_prune_4_partition( - const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, - int part_ctx, int64_t best_rd, - int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], + AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int part_ctx, + int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed, int *const partition_vert4_allowed, unsigned int pb_source_variance, int mi_row, int mi_col) { + if (ext_ml_model_decision_after_part_ab( + cpi, x, bsize, part_ctx, best_rd, rect_part_rd, split_rd, + partition_horz4_allowed, partition_vert4_allowed, pb_source_variance, + mi_row, mi_col)) + return; + if (best_rd >= 1000000000) return; int64_t *horz_rd = rect_part_rd[HORZ]; int64_t *vert_rd = rect_part_rd[VERT]; @@ -1206,6 +1386,13 @@ void av1_ml_prune_4_partition( } assert(feature_index == FEATURES); + // Write features to file + if (!frame_is_intra_only(&cpi->common)) { + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, + FEATURES, 7, bsize, mi_row, mi_col); + } + // Calculate scores using the NN model. 
float score[LABELS] = { 0.0f }; av1_nn_predict(features, nn_config, 1, score); @@ -1238,10 +1425,12 @@ void av1_ml_prune_4_partition( #undef LABELS #define FEATURES 4 -int av1_ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize, - const MACROBLOCK *const x, - const RD_STATS *const rd_stats, - unsigned int pb_source_variance, int bit_depth) { +void av1_ml_predict_breakout(AV1_COMP *const cpi, BLOCK_SIZE bsize, + const MACROBLOCK *const x, + const RD_STATS *const rd_stats, + const PartitionBlkParams blk_params, + unsigned int pb_source_variance, int bit_depth, + int *do_square_split, int *do_rectangular_split) { const NN_CONFIG *nn_config = NULL; int thresh = 0; switch (bsize) { @@ -1267,7 +1456,7 @@ int av1_ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize, break; default: assert(0 && "Unexpected bsize."); } - if (!nn_config || thresh < 0) return 0; + if (!nn_config || thresh < 0) return; const float ml_predict_breakout_thresh_scale[3] = { 1.15f, 1.05f, 1.0f }; thresh = (int)((float)thresh * @@ -1295,13 +1484,28 @@ int av1_ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize, features[feature_index++] = (float)(dc_q * dc_q) / 256.0f; assert(feature_index == FEATURES); + // Write features to file + write_features_to_file(cpi->oxcf.partition_info_path, + cpi->ext_part_controller.test_mode, features, FEATURES, + 2, blk_params.bsize, blk_params.mi_row, + blk_params.mi_col); + + if (ext_ml_model_decision_after_none( + &cpi->ext_part_controller, frame_is_intra_only(&cpi->common), + features, do_square_split, do_rectangular_split)) { + return; + } + // Calculate score using the NN model. float score = 0.0f; av1_nn_predict(features, nn_config, 1, &score); aom_clear_system_state(); // Make decision. 
- return (int)(score * 100) >= thresh; + if ((int)(score * 100) >= thresh) { + *do_square_split = 0; + *do_rectangular_split = 0; + } } #undef FEATURES @@ -1361,7 +1565,7 @@ void av1_prune_partitions_before_search( const int try_intra_cnn_split = !cpi->use_screen_content_tools && frame_is_intra_only(cm) && cpi->sf.part_sf.intra_cnn_split && - cm->seq_params.sb_size >= BLOCK_64X64 && bsize <= BLOCK_64X64 && + cm->seq_params->sb_size >= BLOCK_64X64 && bsize <= BLOCK_64X64 && bsize >= BLOCK_8X8 && mi_row + mi_size_high[bsize] <= mi_params->mi_rows && mi_col + mi_size_wide[bsize] <= mi_params->mi_cols; @@ -1483,8 +1687,9 @@ int evaluate_ab_partition_based_on_split( } void av1_prune_ab_partitions( - const AV1_COMP *cpi, const MACROBLOCK *x, const PC_TREE *pc_tree, - BLOCK_SIZE bsize, int pb_source_variance, int64_t best_rdcost, + AV1_COMP *cpi, const MACROBLOCK *x, const PC_TREE *pc_tree, + BLOCK_SIZE bsize, const int mi_row, const int mi_col, + int pb_source_variance, int64_t best_rdcost, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], int64_t split_rd[SUB_PARTITIONS_SPLIT], const RD_RECT_PART_WIN_INFO *rect_part_win_info, int ext_partition_allowed, @@ -1580,7 +1785,7 @@ void av1_prune_ab_partitions( // TODO(huisu@google.com): x->source_variance may not be the current // block's variance. The correct one to use is pb_source_variance. Need to // re-train the model to fix it. - av1_ml_prune_ab_partition(bsize, pc_tree->partitioning, + av1_ml_prune_ab_partition(cpi, bsize, mi_row, mi_col, pc_tree->partitioning, get_unsigned_bits(x->source_variance), best_rdcost, horz_rd, vert_rd, split_rd, horza_partition_allowed, horzb_partition_allowed, @@ -1617,4 +1822,390 @@ void av1_prune_ab_partitions( } } +// Prepare features for the external model. Specifically, features after +// ab partition is searched. 
+static void prepare_features_after_part_ab( + const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, + int part_ctx, int64_t best_rd, + int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], + int64_t split_rd[SUB_PARTITIONS_SPLIT], unsigned int pb_source_variance, + int mi_row, int mi_col, aom_partition_features_t *const features) { + int64_t *horz_rd = rect_part_rd[HORZ]; + int64_t *vert_rd = rect_part_rd[VERT]; + + aom_clear_system_state(); + + // Generate features. + int feature_index = 0; + features->after_part_ab.f[feature_index++] = (float)part_ctx; + features->after_part_ab.f[feature_index++] = + (float)get_unsigned_bits(pb_source_variance); + + const int rdcost = (int)AOMMIN(INT_MAX, best_rd); + int sub_block_rdcost[8] = { 0 }; + int rd_index = 0; + for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { + if (horz_rd[i] > 0 && horz_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)horz_rd[i]; + ++rd_index; + } + for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) { + if (vert_rd[i] > 0 && vert_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)vert_rd[i]; + ++rd_index; + } + for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) { + if (split_rd[i] > 0 && split_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)split_rd[i]; + ++rd_index; + } + for (int i = 0; i < 8; ++i) { + // Ratio between the sub-block RD and the whole-block RD. + float rd_ratio = 1.0f; + if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost) + rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost; + features->after_part_ab.f[feature_index++] = rd_ratio; + } + + // Get variance of the 1:4 and 4:1 sub-blocks. 
+ unsigned int horz_4_source_var[SUB_PARTITIONS_PART4] = { 0 }; + unsigned int vert_4_source_var[SUB_PARTITIONS_PART4] = { 0 }; + { + BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4); + BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4); + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, + av1_num_planes(&cpi->common), bsize); + const int src_stride = x->plane[0].src.stride; + uint8_t *src = x->plane[0].src.buf; + const MACROBLOCKD *const xd = &x->e_mbd; + + struct buf_2d horz_4_src, vert_4_src; + horz_4_src.stride = src_stride; + vert_4_src.stride = src_stride; + + for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) { + horz_4_src.buf = src + i * block_size_high[horz_4_bs] * src_stride; + vert_4_src.buf = src + i * block_size_wide[vert_4_bs]; + + if (is_cur_buf_hbd(xd)) { + horz_4_source_var[i] = av1_high_get_sby_perpixel_variance( + cpi, &horz_4_src, horz_4_bs, xd->bd); + vert_4_source_var[i] = av1_high_get_sby_perpixel_variance( + cpi, &vert_4_src, vert_4_bs, xd->bd); + } else { + horz_4_source_var[i] = + av1_get_sby_perpixel_variance(cpi, &horz_4_src, horz_4_bs); + vert_4_source_var[i] = + av1_get_sby_perpixel_variance(cpi, &vert_4_src, vert_4_bs); + } + } + } + + const float denom = (float)(pb_source_variance + 1); + const float low_b = 0.1f; + const float high_b = 10.0f; + for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) { + // Ratio between the 4:1 sub-block variance and the whole-block variance. + float var_ratio = (float)(horz_4_source_var[i] + 1) / denom; + if (var_ratio < low_b) var_ratio = low_b; + if (var_ratio > high_b) var_ratio = high_b; + features->after_part_ab.f[feature_index++] = var_ratio; + } + for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) { + // Ratio between the 1:4 sub-block RD and the whole-block RD. 
+ float var_ratio = (float)(vert_4_source_var[i] + 1) / denom; + if (var_ratio < low_b) var_ratio = low_b; + if (var_ratio > high_b) var_ratio = high_b; + features->after_part_ab.f[feature_index++] = var_ratio; + } + assert(feature_index == 18); +} + +// If the external partition model is used, we let it determine partition +// decisions before partition none. Specifically, these parameters: +// partition_none_allowed +// partition_horz_allowed +// partition_vert_allowed +// do_rectangular_split +// do_square_split +static bool ext_ml_model_decision_before_none( + AV1_COMP *cpi, const float features_from_motion[FEATURE_SIZE_SMS_SPLIT], + int *partition_none_allowed, int *partition_horz_allowed, + int *partition_vert_allowed, int *do_rectangular_split, + int *do_square_split) { + ExtPartController *const ext_part_controller = &cpi->ext_part_controller; + if (!ext_part_controller->ready) return false; + + // Setup features. + aom_partition_features_t features; + features.id = FEATURE_BEFORE_PART_NONE; + for (int i = 0; i < FEATURE_SIZE_SMS_SPLIT; ++i) { + features.before_part_none.f[i] = features_from_motion[i]; + } + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. + aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *partition_none_allowed = decision.partition_none_allowed; + *partition_horz_allowed = decision.partition_rect_allowed[HORZ]; + *partition_vert_allowed = decision.partition_rect_allowed[VERT]; + *do_rectangular_split = decision.do_rectangular_split; + *do_square_split = decision.do_square_split; + + return true; +} + +// If the external partition model is used, we let it determine partition +// decisions before partition none. 
Specifically, these parameters: +// prune_horz +// prune_vert +static bool ext_ml_model_decision_before_none_part2( + AV1_COMP *cpi, + const float features_from_motion[FEATURE_SIZE_SMS_PRUNE_PART], + int *prune_horz, int *prune_vert) { + ExtPartController *const ext_part_controller = &cpi->ext_part_controller; + if (!ext_part_controller->ready) return false; + + // Setup features. + aom_partition_features_t features; + features.id = FEATURE_BEFORE_PART_NONE_PART2; + for (int i = 0; i < FEATURE_SIZE_SMS_PRUNE_PART; ++i) { + features.before_part_none.f_part2[i] = features_from_motion[i]; + } + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. + aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *prune_horz = decision.prune_rect_part[HORZ]; + *prune_vert = decision.prune_rect_part[VERT]; + + return true; +} + +// If the external partition model is used, we let it determine partition +// decisions after none partition. Specifically, these parameters: +// do_square_split +// do_rectangular_split +bool ext_ml_model_decision_after_none( + ExtPartController *const ext_part_controller, const int is_intra_frame, + const float *const features_after_none, int *do_square_split, + int *do_rectangular_split) { + if (!ext_part_controller->ready || is_intra_frame) return false; + + // Setup features. + aom_partition_features_t features; + features.id = FEATURE_AFTER_PART_NONE; + for (int i = 0; i < 4; ++i) { + features.after_part_none.f[i] = features_after_none[i]; + } + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. 
+ aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *do_square_split = decision.do_square_split; + *do_rectangular_split = decision.do_rectangular_split; + + return true; +} + +// If the external partition model is used, we let it determine partition +// decisions after none partition. Specifically, these parameters: +// terminate_partition_search +bool ext_ml_model_decision_after_none_part2( + AV1_COMP *const cpi, const float *const features_terminate, + int *terminate_partition_search) { + AV1_COMMON *const cm = &cpi->common; + ExtPartController *const ext_part_controller = &cpi->ext_part_controller; + if (!ext_part_controller->ready || frame_is_intra_only(cm)) return false; + + // Setup features. + aom_partition_features_t features; + features.id = FEATURE_AFTER_PART_NONE_PART2; + for (int i = 0; i < FEATURE_SIZE_SMS_TERM_NONE; ++i) { + features.after_part_none.f_terminate[i] = features_terminate[i]; + } + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. + aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *terminate_partition_search = decision.terminate_partition_search; + + return true; +} + +// If the external partition model is used, we let it determine partition +// decisions after none partition. 
Specifically, these parameters: +// terminate_partition_search +bool ext_ml_model_decision_after_split(AV1_COMP *const cpi, + const float *const features_terminate, + int *terminate_partition_search) { + const AV1_COMMON *const cm = &cpi->common; + ExtPartController *const ext_part_controller = &cpi->ext_part_controller; + if (frame_is_intra_only(cm) || !cpi->ext_part_controller.ready) { + return false; + } + + // Setup features. + aom_partition_features_t features; + features.id = FEATURE_AFTER_PART_SPLIT; + for (int i = 0; i < 31; ++i) { + features.after_part_split.f_terminate[i] = features_terminate[i]; + } + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. + aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *terminate_partition_search = decision.terminate_partition_search; + + return true; +} + +// If the external partition model is used, we let it determine partition +// decisions after none partition. Specifically, these parameters: +// prune_rect_part[HORZ] +// prune_rect_part[VERT] +bool ext_ml_model_decision_after_split_part2( + ExtPartController *const ext_part_controller, const int is_intra_frame, + const float *const features_prune, int *prune_rect_part_horz, + int *prune_rect_part_vert) { + if (is_intra_frame || !ext_part_controller->ready) { + return false; + } + + // Setup features. + aom_partition_features_t features; + features.id = FEATURE_AFTER_PART_SPLIT_PART2; + for (int i = 0; i < 9; ++i) { + features.after_part_split.f_prune_rect[i] = features_prune[i]; + } + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. 
+ aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *prune_rect_part_horz = decision.prune_rect_part[0]; + *prune_rect_part_vert = decision.prune_rect_part[1]; + + return true; +} + +// If the external partition model is used, we let it determine partition +// decisions after rectangular partition. Specifically, these parameters: +// horza_partition_allowed +// horzb_partition_allowed +// verta_partition_allowed +// vertb_partition_allowed +static bool ext_ml_model_decision_after_rect( + ExtPartController *const ext_part_controller, const int is_intra_frame, + const float *const features_after_rect, int *horza_partition_allowed, + int *horzb_partition_allowed, int *verta_partition_allowed, + int *vertb_partition_allowed) { + if (is_intra_frame || !ext_part_controller->ready) return false; + + // Setup features. + aom_partition_features_t features; + features.id = FEATURE_AFTER_PART_RECT; + for (int i = 0; i < 10; ++i) { + features.after_part_rect.f[i] = features_after_rect[i]; + } + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. + aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *horza_partition_allowed = decision.horza_partition_allowed; + *horzb_partition_allowed = decision.horzb_partition_allowed; + *verta_partition_allowed = decision.verta_partition_allowed; + *vertb_partition_allowed = decision.vertb_partition_allowed; + + return true; +} + +// If the external partition model is used, we let it determine partition +// decisions after AB partition. 
Specifically, these parameters: +// partition_vert4_allowed +// partition_horz4_allowed +static bool ext_ml_model_decision_after_part_ab( + AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int part_ctx, + int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], + int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed, + int *const partition_vert4_allowed, unsigned int pb_source_variance, + int mi_row, int mi_col) { + const AV1_COMMON *const cm = &cpi->common; + ExtPartController *const ext_part_controller = &cpi->ext_part_controller; + + if (!frame_is_intra_only(cm) && ext_part_controller->ready) { + // Setup features. + aom_partition_features_t features; + features.id = FEATURE_AFTER_PART_AB; + prepare_features_after_part_ab(cpi, x, bsize, part_ctx, best_rd, + rect_part_rd, split_rd, pb_source_variance, + mi_row, mi_col, &features); + + // Send necessary features to the external model. + av1_ext_part_send_features(ext_part_controller, &features); + + // Get partition decisions from the external model. 
+ aom_partition_decision_t decision; + const bool valid_decision = + av1_ext_part_get_partition_decision(ext_part_controller, &decision); + if (!valid_decision) return false; + + // Populate decisions + *partition_horz4_allowed = decision.partition_horz4_allowed; + *partition_vert4_allowed = decision.partition_vert4_allowed; + + return true; + } + + return false; +} + #endif // !CONFIG_REALTIME_ONLY diff --git a/third_party/libaom/source/libaom/av1/encoder/partition_strategy.h b/third_party/libaom/source/libaom/av1/encoder/partition_strategy.h index 0527a944cd..ed66a364d9 100644 --- a/third_party/libaom/source/libaom/av1/encoder/partition_strategy.h +++ b/third_party/libaom/source/libaom/av1/encoder/partition_strategy.h @@ -13,58 +13,10 @@ #define AOM_AV1_ENCODER_PARTITION_STRATEGY_H_ #include "av1/encoder/encodeframe.h" +#include "av1/encoder/encodeframe_utils.h" #include "av1/encoder/encodemb.h" #include "av1/encoder/encoder.h" -#define FEATURE_SIZE_SMS_SPLIT_FAST 6 -#define FEATURE_SIZE_SMS_SPLIT 17 -#define FEATURE_SIZE_SMS_PRUNE_PART 25 -#define FEATURE_SIZE_SMS_TERM_NONE 28 -#define FEATURE_SIZE_FP_SMS_TERM_NONE 20 -#define FEATURE_SIZE_MAX_MIN_PART_PRED 13 -#define MAX_NUM_CLASSES_MAX_MIN_PART_PRED 4 - -#define FEATURE_SMS_NONE_FLAG 1 -#define FEATURE_SMS_SPLIT_FLAG (1 << 1) -#define FEATURE_SMS_RECT_FLAG (1 << 2) - -#define FEATURE_SMS_PRUNE_PART_FLAG \ - (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG | FEATURE_SMS_RECT_FLAG) -#define FEATURE_SMS_SPLIT_MODEL_FLAG \ - (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG) - -// Number of sub-partitions in rectangular partition types. -#define SUB_PARTITIONS_RECT 2 - -// Number of sub-partitions in split partition type. -#define SUB_PARTITIONS_SPLIT 4 - -// Number of sub-partitions in AB partition types. -#define SUB_PARTITIONS_AB 3 - -// Number of sub-partitions in 4-way partition types. -#define SUB_PARTITIONS_PART4 4 - -// 4part parition types. 
-enum { HORZ4 = 0, VERT4, NUM_PART4_TYPES } UENUM1BYTE(PART4_TYPES); - -// AB parition types. -enum { - HORZ_A = 0, - HORZ_B, - VERT_A, - VERT_B, - NUM_AB_PARTS -} UENUM1BYTE(AB_PART_TYPE); - -// Rectangular parition types. -enum { HORZ = 0, VERT, NUM_RECT_PARTS } UENUM1BYTE(RECT_PART_TYPE); - -// Structure to keep win flags for HORZ and VERT partition evaluations. -typedef struct { - int rect_part_win[NUM_RECT_PARTS]; -} RD_RECT_PART_WIN_INFO; - void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x, int bsize, int label_idx, int *partition_none_allowed, @@ -129,16 +81,18 @@ void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x, // no information about rectangular partitions. Preliminary experiments suggest // that we can get better performance by adding in q_index and rectangular // sse/var from SMS. We should retrain and tune this model later. -void av1_ml_prune_rect_partition(const AV1_COMP *const cpi, - const MACROBLOCK *const x, BLOCK_SIZE bsize, - int64_t best_rd, int64_t none_rd, - int64_t *split_rd, int *const dst_prune_horz, +void av1_ml_prune_rect_partition(AV1_COMP *const cpi, const MACROBLOCK *const x, + BLOCK_SIZE bsize, const int mi_row, + const int mi_col, int64_t best_rd, + int64_t none_rd, int64_t *split_rd, + int *const dst_prune_horz, int *const dst_prune_vert); // Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be // considered. 
void av1_ml_prune_ab_partition( - BLOCK_SIZE bsize, int part_ctx, int var_ctx, int64_t best_rd, + AV1_COMP *const cpi, BLOCK_SIZE bsize, const int mi_row, const int mi_col, + int part_ctx, int var_ctx, int64_t best_rd, int64_t horz_rd[SUB_PARTITIONS_RECT], int64_t vert_rd[SUB_PARTITIONS_RECT], int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const horza_partition_allowed, int *const horzb_partition_allowed, int *const verta_partition_allowed, @@ -146,18 +100,19 @@ void av1_ml_prune_ab_partition( // Use a ML model to predict if horz4 and vert4 should be considered. void av1_ml_prune_4_partition( - const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, - int part_ctx, int64_t best_rd, - int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], + AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int part_ctx, + int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed, int *const partition_vert4_allowed, unsigned int pb_source_variance, int mi_row, int mi_col); // ML-based partition search breakout after PARTITION_NONE. -int av1_ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize, - const MACROBLOCK *const x, - const RD_STATS *const rd_stats, - unsigned int pb_source_variance, int bit_depth); +void av1_ml_predict_breakout(AV1_COMP *const cpi, BLOCK_SIZE bsize, + const MACROBLOCK *const x, + const RD_STATS *const rd_stats, + const PartitionBlkParams blk_params, + unsigned int pb_source_variance, int bit_depth, + int *do_square_split, int *do_rectangular_split); // The first round of partition pruning determined before any partition // has been tested. The decisions will be updated and passed back @@ -183,8 +138,9 @@ void av1_prune_partitions_by_max_min_bsize( // Prune out AB partitions based on rd decisions made from testing the // basic partitions. 
void av1_prune_ab_partitions( - const AV1_COMP *cpi, const MACROBLOCK *x, const PC_TREE *pc_tree, - BLOCK_SIZE bsize, int pb_source_variance, int64_t best_rdcost, + AV1_COMP *cpi, const MACROBLOCK *x, const PC_TREE *pc_tree, + BLOCK_SIZE bsize, const int mi_row, const int mi_col, + int pb_source_variance, int64_t best_rdcost, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT], int64_t split_rd[SUB_PARTITIONS_SPLIT], const RD_RECT_PART_WIN_INFO *rect_part_win_info, int ext_partition_allowed, @@ -261,22 +217,66 @@ static INLINE int is_full_sb(const CommonModeInfoParams *const mi_params, (mi_col + sb_mi_wide) <= mi_params->mi_cols; } +#if !CONFIG_REALTIME_ONLY // Do not use this criteria for screen content videos. // Since screen content videos could often find good predictors and the largest // block size is likely to be used. static INLINE int use_auto_max_partition(const AV1_COMP *const cpi, BLOCK_SIZE sb_size, int mi_row, int mi_col) { - assert(IMPLIES(cpi->gf_group.size > 0, - cpi->gf_group.index < cpi->gf_group.size)); + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); const AV1_COMMON *const cm = &cpi->common; return !frame_is_intra_only(cm) && !cpi->use_screen_content_tools && cpi->sf.part_sf.auto_max_partition_based_on_simple_motion != NOT_IN_USE && sb_size == BLOCK_128X128 && is_full_sb(&cm->mi_params, mi_row, mi_col, sb_size) && - cpi->gf_group.update_type[cpi->gf_group.index] != OVERLAY_UPDATE && - cpi->gf_group.update_type[cpi->gf_group.index] != INTNL_OVERLAY_UPDATE; + cpi->ppi->gf_group.update_type[cpi->gf_frame_index] != + OVERLAY_UPDATE && + cpi->ppi->gf_group.update_type[cpi->gf_frame_index] != + INTNL_OVERLAY_UPDATE; } +static BLOCK_SIZE dim_to_size(int dim) { + switch (dim) { + case 4: return BLOCK_4X4; + case 8: return BLOCK_8X8; + case 16: return BLOCK_16X16; + case 32: return BLOCK_32X32; + case 64: return BLOCK_64X64; + case 128: return BLOCK_128X128; + default: assert(0); return 0; + } +} + 
+static AOM_INLINE void set_max_min_partition_size(SuperBlockEnc *sb_enc, + AV1_COMP *cpi, MACROBLOCK *x, + const SPEED_FEATURES *sf, + BLOCK_SIZE sb_size, + int mi_row, int mi_col) { + const AV1_COMMON *cm = &cpi->common; + + sb_enc->max_partition_size = + AOMMIN(sf->part_sf.default_max_partition_size, + dim_to_size(cpi->oxcf.part_cfg.max_partition_size)); + sb_enc->min_partition_size = + AOMMAX(sf->part_sf.default_min_partition_size, + dim_to_size(cpi->oxcf.part_cfg.min_partition_size)); + sb_enc->max_partition_size = + AOMMIN(sb_enc->max_partition_size, cm->seq_params->sb_size); + sb_enc->min_partition_size = + AOMMIN(sb_enc->min_partition_size, cm->seq_params->sb_size); + + if (use_auto_max_partition(cpi, sb_size, mi_row, mi_col)) { + float features[FEATURE_SIZE_MAX_MIN_PART_PRED] = { 0.0f }; + + av1_get_max_min_partition_features(cpi, x, mi_row, mi_col, features); + sb_enc->max_partition_size = + AOMMAX(AOMMIN(av1_predict_max_partition(cpi, x, features), + sb_enc->max_partition_size), + sb_enc->min_partition_size); + } +} +#endif // !CONFIG_REALTIME_ONLY #endif // AOM_AV1_ENCODER_PARTITION_STRATEGY_H_ diff --git a/third_party/libaom/source/libaom/av1/encoder/pass2_strategy.c b/third_party/libaom/source/libaom/av1/encoder/pass2_strategy.c index 804fb3a510..e3639f7784 100644 --- a/third_party/libaom/source/libaom/av1/encoder/pass2_strategy.c +++ b/third_party/libaom/source/libaom/av1/encoder/pass2_strategy.c @@ -43,6 +43,13 @@ #define DEFAULT_KF_BOOST 2300 #define DEFAULT_GF_BOOST 2000 #define GROUP_ADAPTIVE_MAXQ 1 + +static INLINE int is_fp_stats_to_predict_flat_gop_invalid( + const FIRSTPASS_STATS *fp_stats) { + return ((fp_stats->tr_coded_error < 0) || (fp_stats->pcnt_third_ref < 0) || + (fp_stats->frame_avg_wavelet_energy < 0)); +} + static void init_gf_stats(GF_GROUP_STATS *gf_stats); // Calculate an active area of the image that discounts formatting @@ -182,7 +189,7 @@ static double calc_correction_factor(double err_per_mb, int q) { // Based on history 
adjust expectations of bits per macroblock. static void twopass_update_bpm_factor(AV1_COMP *cpi, int rate_err_tol) { - TWO_PASS *twopass = &cpi->twopass; + TWO_PASS *twopass = &cpi->ppi->twopass; const RATE_CONTROL *const rc = &cpi->rc; int err_estimate = rc->rate_error_estimate; @@ -194,14 +201,14 @@ static void twopass_update_bpm_factor(AV1_COMP *cpi, int rate_err_tol) { const double max_fac = 1.0 + adj_limit; if (rc->vbr_bits_off_target && rc->total_actual_bits > 0) { - if (cpi->lap_enabled) { + if (cpi->ppi->lap_enabled) { rate_err_factor = (double)twopass->rolling_arf_group_actual_bits / DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits); } else { rate_err_factor = 1.0 - ((double)(rc->vbr_bits_off_target) / - AOMMAX(rc->total_actual_bits, cpi->twopass.bits_left)); + AOMMAX(rc->total_actual_bits, cpi->ppi->twopass.bits_left)); } rate_err_factor = AOMMAX(min_fac, AOMMIN(max_fac, rate_err_factor)); @@ -209,7 +216,7 @@ static void twopass_update_bpm_factor(AV1_COMP *cpi, int rate_err_tol) { // Adjustment is damped if this is 1 pass with look ahead processing // (as there are only ever a few frames of data) and for all but the first // GOP in normal two pass. - if ((twopass->bpm_factor != 1.0) || cpi->lap_enabled) { + if ((twopass->bpm_factor != 1.0) || cpi->ppi->lap_enabled) { rate_err_factor = 1.0 + ((rate_err_factor - 1.0) / damp_fac); } } @@ -302,9 +309,9 @@ static int get_twopass_worst_quality(AV1_COMP *cpi, const double av_frame_err, // Try and pick a max Q that will be high enough to encode the // content at the given rate. int q = find_qindex_by_rate_with_correction( - target_norm_bits_per_mb, cpi->common.seq_params.bit_depth, - av_err_per_mb, cpi->twopass.bpm_factor, rate_err_tol, rc->best_quality, - rc->worst_quality); + target_norm_bits_per_mb, cpi->common.seq_params->bit_depth, + av_err_per_mb, cpi->ppi->twopass.bpm_factor, rate_err_tol, + rc->best_quality, rc->worst_quality); // Restriction on active max q for constrained quality mode. 
if (rc_cfg->mode == AOM_CQ) q = AOMMAX(q, rc_cfg->cq_level); @@ -312,57 +319,63 @@ static int get_twopass_worst_quality(AV1_COMP *cpi, const double av_frame_err, } } -#define SR_DIFF_PART 0.0015 -#define MOTION_AMP_PART 0.003 #define INTRA_PART 0.005 #define DEFAULT_DECAY_LIMIT 0.75 #define LOW_SR_DIFF_TRHESH 0.1 -#define SR_DIFF_MAX 128.0 #define NCOUNT_FRAME_II_THRESH 5.0 +#define LOW_CODED_ERR_PER_MB 10.0 -static double get_sr_decay_rate(const FRAME_INFO *frame_info, - const FIRSTPASS_STATS *frame) { - const int num_mbs = frame_info->num_mbs; - double sr_diff = (frame->sr_coded_error - frame->coded_error) / num_mbs; +/* This function considers how the quality of prediction may be deteriorating + * with distance. It comapres the coded error for the last frame and the + * second reference frame (usually two frames old) and also applies a factor + * based on the extent of INTRA coding. + * + * The decay factor is then used to reduce the contribution of frames further + * from the alt-ref or golden frame, to the bitframe boost calculation for that + * alt-ref or golden frame. 
+ */ +static double get_sr_decay_rate(const FIRSTPASS_STATS *frame) { + double sr_diff = (frame->sr_coded_error - frame->coded_error); double sr_decay = 1.0; double modified_pct_inter; double modified_pcnt_intra; - const double motion_amplitude_factor = - frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2); modified_pct_inter = frame->pcnt_inter; - if ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) < - (double)NCOUNT_FRAME_II_THRESH) { + if ((frame->coded_error > LOW_CODED_ERR_PER_MB) && + ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) < + (double)NCOUNT_FRAME_II_THRESH)) { modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral; } modified_pcnt_intra = 100 * (1.0 - modified_pct_inter); if ((sr_diff > LOW_SR_DIFF_TRHESH)) { - sr_diff = AOMMIN(sr_diff, SR_DIFF_MAX); - sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) - - (MOTION_AMP_PART * motion_amplitude_factor) - - (INTRA_PART * modified_pcnt_intra); + double sr_diff_part = ((sr_diff * 0.25) / frame->intra_error); + sr_decay = 1.0 - sr_diff_part - (INTRA_PART * modified_pcnt_intra); } - return AOMMAX(sr_decay, AOMMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter)); + return AOMMAX(sr_decay, DEFAULT_DECAY_LIMIT); } // This function gives an estimate of how badly we believe the prediction // quality is decaying from frame to frame. 
-static double get_zero_motion_factor(const FRAME_INFO *frame_info, - const FIRSTPASS_STATS *frame) { +static double get_zero_motion_factor(const FIRSTPASS_STATS *frame) { const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion; - double sr_decay = get_sr_decay_rate(frame_info, frame); + double sr_decay = get_sr_decay_rate(frame); return AOMMIN(sr_decay, zero_motion_pct); } -#define ZM_POWER_FACTOR 0.75 +#define DEFAULT_ZM_FACTOR 0.5 +static double get_prediction_decay_rate(const FIRSTPASS_STATS *frame_stats) { + const double sr_decay_rate = get_sr_decay_rate(frame_stats); + double zero_motion_factor = + DEFAULT_ZM_FACTOR * (frame_stats->pcnt_inter - frame_stats->pcnt_motion); -static double get_prediction_decay_rate(const FRAME_INFO *frame_info, - const FIRSTPASS_STATS *next_frame) { - const double sr_decay_rate = get_sr_decay_rate(frame_info, next_frame); - const double zero_motion_factor = - (0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion), - ZM_POWER_FACTOR)); + // Clamp value to range 0.0 to 1.0 + // This should happen anyway if input values are sensibly clamped but checked + // here just in case. 
+ if (zero_motion_factor > 1.0) + zero_motion_factor = 1.0; + else if (zero_motion_factor < 0.0) + zero_motion_factor = 0.0; return AOMMAX(zero_motion_factor, (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor))); @@ -449,7 +462,6 @@ static void accumulate_this_frame_stats(const FIRSTPASS_STATS *stats, } static void accumulate_next_frame_stats(const FIRSTPASS_STATS *stats, - const FRAME_INFO *frame_info, const int flash_detected, const int frames_since_key, const int cur_idx, @@ -470,16 +482,15 @@ static void accumulate_next_frame_stats(const FIRSTPASS_STATS *stats, // Accumulate the effect of prediction quality decay if (!flash_detected) { gf_stats->last_loop_decay_rate = gf_stats->loop_decay_rate; - gf_stats->loop_decay_rate = get_prediction_decay_rate(frame_info, stats); + gf_stats->loop_decay_rate = get_prediction_decay_rate(stats); gf_stats->decay_accumulator = gf_stats->decay_accumulator * gf_stats->loop_decay_rate; // Monitor for static sections. if ((frames_since_key + cur_idx - 1) > 1) { - gf_stats->zero_motion_accumulator = - AOMMIN(gf_stats->zero_motion_accumulator, - get_zero_motion_factor(frame_info, stats)); + gf_stats->zero_motion_accumulator = AOMMIN( + gf_stats->zero_motion_accumulator, get_zero_motion_factor(stats)); } } } @@ -618,8 +629,8 @@ static double calc_kf_frame_boost(const RATE_CONTROL *rc, return AOMMIN(frame_boost, max_boost * boost_q_correction); } -static int get_projected_gfu_boost(const RATE_CONTROL *rc, int gfu_boost, - int frames_to_project, +static int get_projected_gfu_boost(const PRIMARY_RATE_CONTROL *p_rc, + int gfu_boost, int frames_to_project, int num_stats_used_for_gfu_boost) { /* * If frames_to_project is equal to num_stats_used_for_gfu_boost, @@ -629,7 +640,7 @@ static int get_projected_gfu_boost(const RATE_CONTROL *rc, int gfu_boost, */ if (num_stats_used_for_gfu_boost >= frames_to_project) return gfu_boost; - double min_boost_factor = sqrt(rc->baseline_gf_interval); + double min_boost_factor = 
sqrt(p_rc->baseline_gf_interval); // Get the current tpl factor (number of frames = frames_to_project). double tpl_factor = av1_get_gfu_boost_projection_factor( min_boost_factor, MAX_GFUBOOST_FACTOR, frames_to_project); @@ -642,11 +653,13 @@ static int get_projected_gfu_boost(const RATE_CONTROL *rc, int gfu_boost, } #define GF_MAX_BOOST 90.0 +#define GF_MIN_BOOST 50 #define MIN_DECAY_FACTOR 0.01 -int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc, +int av1_calc_arf_boost(const TWO_PASS *twopass, + const PRIMARY_RATE_CONTROL *p_rc, const RATE_CONTROL *rc, FRAME_INFO *frame_info, int offset, int f_frames, int b_frames, int *num_fpstats_used, - int *num_fpstats_required) { + int *num_fpstats_required, int project_gfu_boost) { int i; GF_GROUP_STATS gf_stats; init_gf_stats(&gf_stats); @@ -670,8 +683,7 @@ int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc, // Accumulate the effect of prediction quality decay. if (!flash_detected) { - gf_stats.decay_accumulator *= - get_prediction_decay_rate(frame_info, this_frame); + gf_stats.decay_accumulator *= get_prediction_decay_rate(this_frame); gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR ? MIN_DECAY_FACTOR : gf_stats.decay_accumulator; @@ -704,8 +716,7 @@ int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc, // Cumulative effect of prediction quality decay. if (!flash_detected) { - gf_stats.decay_accumulator *= - get_prediction_decay_rate(frame_info, this_frame); + gf_stats.decay_accumulator *= get_prediction_decay_rate(this_frame); gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR ? 
MIN_DECAY_FACTOR : gf_stats.decay_accumulator; @@ -719,16 +730,16 @@ int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc, } arf_boost += (int)boost_score; - if (num_fpstats_required) { + if (project_gfu_boost) { + assert(num_fpstats_required != NULL); + assert(num_fpstats_used != NULL); *num_fpstats_required = f_frames + b_frames; - if (num_fpstats_used) { - arf_boost = get_projected_gfu_boost(rc, arf_boost, *num_fpstats_required, - *num_fpstats_used); - } + arf_boost = get_projected_gfu_boost(p_rc, arf_boost, *num_fpstats_required, + *num_fpstats_used); } - if (arf_boost < ((b_frames + f_frames) * 50)) - arf_boost = ((b_frames + f_frames) * 50); + if (arf_boost < ((b_frames + f_frames) * GF_MIN_BOOST)) + arf_boost = ((b_frames + f_frames) * GF_MIN_BOOST); return arf_boost; } @@ -767,7 +778,8 @@ static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin, static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi, double gf_group_err) { const RATE_CONTROL *const rc = &cpi->rc; - const TWO_PASS *const twopass = &cpi->twopass; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const TWO_PASS *const twopass = &cpi->ppi->twopass; const int max_bits = frame_max_bits(rc, &cpi->oxcf); int64_t total_group_bits; @@ -787,8 +799,8 @@ static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi, : total_group_bits; // Clip based on user supplied data rate variability limit. 
- if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval) - total_group_bits = (int64_t)max_bits * rc->baseline_gf_interval; + if (total_group_bits > (int64_t)max_bits * p_rc->baseline_gf_interval) + total_group_bits = (int64_t)max_bits * p_rc->baseline_gf_interval; return total_group_bits; } @@ -834,7 +846,8 @@ static int adjust_boost_bits_for_target_level(const AV1_COMP *const cpi, int64_t group_bits, int frame_type) { const AV1_COMMON *const cm = &cpi->common; - const SequenceHeader *const seq_params = &cm->seq_params; + const SequenceHeader *const seq_params = cm->seq_params; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const int temporal_layer_id = cm->temporal_layer_id; const int spatial_layer_id = cm->spatial_layer_id; for (int index = 0; index < seq_params->operating_points_cnt_minus_1 + 1; @@ -845,7 +858,7 @@ static int adjust_boost_bits_for_target_level(const AV1_COMP *const cpi, } const AV1_LEVEL target_level = - cpi->level_params.target_seq_level_idx[index]; + cpi->ppi->level_params.target_seq_level_idx[index]; if (target_level >= SEQ_LEVELS) continue; assert(is_valid_seq_level_idx(target_level)); @@ -859,18 +872,20 @@ static int adjust_boost_bits_for_target_level(const AV1_COMP *const cpi, const int level_enforced_max_kf_bits = target_bits_per_frame * 8; if (bits_assigned > level_enforced_max_kf_bits) { const int frames = rc->frames_to_key - 1; - rc->kf_boost = calculate_boost_factor( + p_rc->kf_boost = calculate_boost_factor( frames, level_enforced_max_kf_bits, group_bits); - bits_assigned = calculate_boost_bits(frames, rc->kf_boost, group_bits); + bits_assigned = + calculate_boost_bits(frames, p_rc->kf_boost, group_bits); } } else if (frame_type == 1) { // Maximum bits for arf is 4 times the target_bits_per_frame. 
const int level_enforced_max_arf_bits = target_bits_per_frame * 4; if (bits_assigned > level_enforced_max_arf_bits) { - rc->gfu_boost = calculate_boost_factor( - rc->baseline_gf_interval, level_enforced_max_arf_bits, group_bits); - bits_assigned = calculate_boost_bits(rc->baseline_gf_interval, - rc->gfu_boost, group_bits); + p_rc->gfu_boost = + calculate_boost_factor(p_rc->baseline_gf_interval, + level_enforced_max_arf_bits, group_bits); + bits_assigned = calculate_boost_bits(p_rc->baseline_gf_interval, + p_rc->gfu_boost, group_bits); } } else { assert(0); @@ -883,7 +898,9 @@ static int adjust_boost_bits_for_target_level(const AV1_COMP *const cpi, // Allocate bits to each frame in a GF / ARF group double layer_fraction[MAX_ARF_LAYERS + 1] = { 1.0, 0.70, 0.55, 0.60, 0.60, 1.0, 1.0 }; -static void allocate_gf_group_bits(GF_GROUP *gf_group, RATE_CONTROL *const rc, +static void allocate_gf_group_bits(GF_GROUP *gf_group, + PRIMARY_RATE_CONTROL *const p_rc, + RATE_CONTROL *const rc, int64_t gf_group_bits, int gf_arf_bits, int key_frame, int use_arf) { int64_t total_group_bits = gf_group_bits; @@ -900,7 +917,7 @@ static void allocate_gf_group_bits(GF_GROUP *gf_group, RATE_CONTROL *const rc, if (use_arf) total_group_bits -= gf_arf_bits; int num_frames = - AOMMAX(1, rc->baseline_gf_interval - (rc->frames_since_key == 0)); + AOMMAX(1, p_rc->baseline_gf_interval - (rc->frames_since_key == 0)); base_frame_bits = (int)(total_group_bits / num_frames); // Check the number of frames in each layer in case we have a @@ -943,7 +960,8 @@ static void allocate_gf_group_bits(GF_GROUP *gf_group, RATE_CONTROL *const rc, // in the next GOP. For GF group, next GOP will overwrite the rate allocation. // Setting this frame to use 0 bit (of out the current GOP budget) will // simplify logics in reference frame management. 
- gf_group->bit_allocation[gf_group_size] = 0; + if (gf_group_size < MAX_STATIC_GF_GROUP_LENGTH) + gf_group->bit_allocation[gf_group_size] = 0; } // Returns true if KF group and GF group both are almost completely static. @@ -967,7 +985,7 @@ static INLINE int detect_gf_cut(AV1_COMP *cpi, int frame_index, int cur_start, int active_min_gf_interval, GF_GROUP_STATS *gf_stats) { RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->ppi->twopass; InitialDimensions *const initial_dimensions = &cpi->initial_dimensions; // Motion breakout threshold for loop below depends on image size. const double mv_ratio_accumulator_thresh = @@ -997,12 +1015,71 @@ static INLINE int detect_gf_cut(AV1_COMP *cpi, int frame_index, int cur_start, // so we can continue for more frames. if (((frame_index - cur_start) >= active_max_gf_interval + 1) && !is_almost_static(gf_stats->zero_motion_accumulator, - twopass->kf_zeromotion_pct, cpi->lap_enabled)) { + twopass->kf_zeromotion_pct, cpi->ppi->lap_enabled)) { return 1; } return 0; } +static int is_shorter_gf_interval_better(AV1_COMP *cpi, + EncodeFrameParams *frame_params, + const EncodeFrameInput *frame_input) { + RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + int gop_length_decision_method = cpi->sf.tpl_sf.gop_length_decision_method; + int shorten_gf_interval; + + av1_tpl_preload_rc_estimate(cpi, frame_params); + + if (gop_length_decision_method == 2) { + // GF group length is decided based on GF boost and tpl stats of ARFs from + // base layer, (base+1) layer. 
+ shorten_gf_interval = + (p_rc->gfu_boost < + p_rc->num_stats_used_for_gfu_boost * GF_MIN_BOOST * 1.4) && + !av1_tpl_setup_stats(cpi, 3, frame_params, frame_input); + } else { + int do_complete_tpl = 1; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; + int is_temporal_filter_enabled = + (rc->frames_since_key > 0 && gf_group->arf_index > -1); + + if (is_temporal_filter_enabled) { + int arf_src_index = gf_group->arf_src_offset[gf_group->arf_index]; + FRAME_UPDATE_TYPE arf_update_type = + gf_group->update_type[gf_group->arf_index]; + int is_forward_keyframe = 0; + av1_temporal_filter(cpi, arf_src_index, arf_update_type, + is_forward_keyframe, NULL); + aom_extend_frame_borders(&cpi->ppi->alt_ref_buffer, + av1_num_planes(&cpi->common)); + } + + if (gop_length_decision_method == 1) { + // Check if tpl stats of ARFs from base layer, (base+1) layer, + // (base+2) layer can decide the GF group length. + int gop_length_eval = + av1_tpl_setup_stats(cpi, 2, frame_params, frame_input); + + if (gop_length_eval != 2) { + do_complete_tpl = 0; + shorten_gf_interval = !gop_length_eval; + } + } + + if (do_complete_tpl) { + // Decide GF group length based on complete tpl stats. + shorten_gf_interval = + !av1_tpl_setup_stats(cpi, 1, frame_params, frame_input); + // Tpl stats is reused when the ARF is temporally filtered and GF + // interval is not shortened. + if (is_temporal_filter_enabled && !shorten_gf_interval) + cpi->skip_tpl_setup_stats = 1; + } + } + return shorten_gf_interval; +} + #define MIN_FWD_KF_INTERVAL 8 #define MIN_SHRINK_LEN 6 // the minimum length of gf if we are shrinking #define SMOOTH_FILT_LEN 7 @@ -1014,17 +1091,16 @@ const double smooth_filt[SMOOTH_FILT_LEN] = { 0.006, 0.061, 0.242, 0.383, 0.242, 0.061, 0.006 }; // Smooth filter intra_error and coded_error in firstpass stats. -// If ignore[i]==1, the ith element should not be used in the filtering. 
-static void smooth_filter_stats(const FIRSTPASS_STATS *stats, const int *ignore, - int start_idx, int last_idx, - double *filt_intra_err, +// If stats[i].is_flash==1, the ith element should not be used in the filtering. +static void smooth_filter_stats(const FIRSTPASS_STATS *stats, int start_idx, + int last_idx, double *filt_intra_err, double *filt_coded_err) { int i, j; for (i = start_idx; i <= last_idx; i++) { double total_wt = 0; for (j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) { int idx = AOMMIN(AOMMAX(i + j, start_idx), last_idx); - if (ignore[idx]) continue; + if (stats[idx].is_flash) continue; filt_intra_err[i] += smooth_filt[j + HALF_FILT_LEN] * stats[idx].intra_error; @@ -1041,7 +1117,7 @@ static void smooth_filter_stats(const FIRSTPASS_STATS *stats, const int *ignore, for (j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) { int idx = AOMMIN(AOMMAX(i + j, start_idx), last_idx); // Coded error involves idx and idx - 1. - if (ignore[idx] || (idx > 0 && ignore[idx - 1])) continue; + if (stats[idx].is_flash || (idx > 0 && stats[idx - 1].is_flash)) continue; filt_coded_err[i] += smooth_filt[j + HALF_FILT_LEN] * stats[idx].coded_error; @@ -1070,7 +1146,7 @@ static void get_gradient(const double *values, int start, int last, } static int find_next_scenecut(const FIRSTPASS_STATS *const stats_start, - int first, int last, int *ignore) { + int first, int last) { // Identify unstable areas caused by scenecuts. // Find the max and 2nd max coded error, and the average of the rest frames. 
// If there is only one frame that yields a huge coded error, it is likely a @@ -1081,14 +1157,16 @@ static int find_next_scenecut(const FIRSTPASS_STATS *const stats_start, if (last - first == 0) return -1; for (int i = first; i <= last; i++) { - if (ignore[i] || (i > 0 && ignore[i - 1])) continue; + if (stats_start[i].is_flash || (i > 0 && stats_start[i - 1].is_flash)) + continue; double temp_intra = AOMMAX(stats_start[i].intra_error, 0.01); this_ratio = stats_start[i].coded_error / temp_intra; // find the avg ratio in the preceding neighborhood max_prev_ratio = 0; max_prev_coded = 0; for (int j = AOMMAX(first, i - HALF_WIN); j < i; j++) { - if (ignore[j] || (j > 0 && ignore[j - 1])) continue; + if (stats_start[j].is_flash || (j > 0 && stats_start[j - 1].is_flash)) + continue; temp_intra = AOMMAX(stats_start[j].intra_error, 0.01); double temp_ratio = stats_start[j].coded_error / temp_intra; if (temp_ratio > max_prev_ratio) { @@ -1102,7 +1180,8 @@ static int find_next_scenecut(const FIRSTPASS_STATS *const stats_start, max_next_ratio = 0; max_next_coded = 0; for (int j = i + 1; j <= AOMMIN(i + HALF_WIN, last); j++) { - if (ignore[j] || (j > 0 && ignore[j - 1])) continue; + if (stats_start[j].is_flash || (j > 0 && stats_start[j - 1].is_flash)) + continue; temp_intra = AOMMAX(stats_start[j].intra_error, 0.01); double temp_ratio = stats_start[j].coded_error / temp_intra; if (temp_ratio > max_next_ratio) { @@ -1135,19 +1214,6 @@ static int find_next_scenecut(const FIRSTPASS_STATS *const stats_start, return -1; } -static void mark_flashes(const FIRSTPASS_STATS *stats, int start_idx, - int last_idx, int *is_flash) { - int i; - for (i = start_idx; i < last_idx; i++) { - if (stats[i + 1].pcnt_second_ref > stats[i + 1].pcnt_inter && - stats[i + 1].pcnt_second_ref >= 0.5) { - // this is a new flash frame - is_flash[i] = 1; - continue; - } - } -} - // Remove the region with index next_region. 
// parameter merge: 0: merge with previous; 1: merge with next; 2: // merge with both, take type from previous if possible @@ -1220,46 +1286,10 @@ static void insert_region(int start, int last, REGION_TYPES type, *cur_region_idx = k; } -// Estimate the noise variance of each frame from the first pass stats -static void estimate_region_noise(const FIRSTPASS_STATS *stats, - const int *is_flash, REGIONS *region) { - double C1, C2, C3, noise; - int count = 0; - region->avg_noise_var = -1; - for (int i = region->start + 2; i <= region->last; i++) { - if (is_flash[i] || is_flash[i - 1] || is_flash[i - 2]) continue; - - C1 = stats[i - 1].intra_error * - (stats[i].intra_error - stats[i].coded_error); - C2 = stats[i - 2].intra_error * - (stats[i - 1].intra_error - stats[i - 1].coded_error); - C3 = stats[i - 2].intra_error * - (stats[i].intra_error - stats[i].sr_coded_error); - if (C1 <= 0 || C2 <= 0 || C3 <= 0) continue; - C1 = sqrt(C1); - C2 = sqrt(C2); - C3 = sqrt(C3); - - noise = stats[i - 1].intra_error - C1 * C2 / C3; - noise = AOMMAX(noise, 0.01); - region->avg_noise_var = (region->avg_noise_var == -1) - ? noise - : AOMMIN(noise, region->avg_noise_var); - count++; - } - if (count == 0) { - region->avg_noise_var = 0; - } -} - -// Analyze the corrrelation coefficient of each frame with its previous frame in -// a region. Also get the average of stats inside a region. -// Before calling this function, the region's noise variance is needed. -static void analyze_region(const FIRSTPASS_STATS *stats, int region_idx, - REGIONS *regions, double *coeff) { - double cor_coeff; - - int i, k = region_idx; +// Get the average of stats inside a region. 
+static void analyze_region(const FIRSTPASS_STATS *stats, int k, + REGIONS *regions) { + int i; regions[k].avg_cor_coeff = 0; regions[k].avg_sr_fr_ratio = 0; regions[k].avg_intra_err = 0; @@ -1268,12 +1298,6 @@ static void analyze_region(const FIRSTPASS_STATS *stats, int region_idx, int check_first_sr = (k != 0); for (i = regions[k].start; i <= regions[k].last; i++) { - double C = sqrt(AOMMAX(stats[i - 1].intra_error * - (stats[i].intra_error - stats[i].coded_error), - 0.001)); - cor_coeff = - C / AOMMAX(stats[i - 1].intra_error - regions[k].avg_noise_var, 0.001); - if (i > regions[k].start || check_first_sr) { double num_frames = (double)(regions[k].last - regions[k].start + check_first_sr); @@ -1289,85 +1313,27 @@ static void analyze_region(const FIRSTPASS_STATS *stats, int region_idx, regions[k].avg_coded_err += stats[i].coded_error / (double)(regions[k].last - regions[k].start + 1); - coeff[i] = - cor_coeff * - sqrt( - AOMMAX(stats[i - 1].intra_error - regions[k].avg_noise_var, 0.001) / - AOMMAX(stats[i].intra_error - regions[k].avg_noise_var, 0.001)); - // clip correlation coefficient. - coeff[i] = AOMMIN(AOMMAX(coeff[i], 0), 1); - regions[k].avg_cor_coeff += - coeff[i] / (double)(regions[k].last - regions[k].start + 1); + AOMMAX(stats[i].cor_coeff, 0.001) / + (double)(regions[k].last - regions[k].start + 1); + regions[k].avg_noise_var += + AOMMAX(stats[i].noise_var, 0.001) / + (double)(regions[k].last - regions[k].start + 1); } } -// Calculate the regions stats of every region. Uses the stable regions to -// estimate noise variance of other regions. Then call analyze_region for each. -static void get_region_stats(const FIRSTPASS_STATS *stats, const int *is_flash, - REGIONS *regions, double *coeff, int num_regions) { - int k, count_stable = 0; - // Analyze stable regions. 
- for (k = 0; k < num_regions; k++) { - if (regions[k].type == STABLE_REGION) { - estimate_region_noise(stats, is_flash, regions + k); - analyze_region(stats, k, regions, coeff); - count_stable++; - } - } - - if (count_stable == 0) { - // no stable region, just use the lowest noise variance estimated. - double lowest_noise = -1; - for (k = 0; k < num_regions; k++) { - if (regions[k].type == SCENECUT_REGION) continue; - estimate_region_noise(stats, is_flash, regions + k); - if (regions[k].avg_noise_var < 0.01) continue; - if (lowest_noise < 0 || lowest_noise > regions[k].avg_noise_var) { - lowest_noise = regions[k].avg_noise_var; - } - } - lowest_noise = AOMMAX(lowest_noise, 0); - for (k = 0; k < num_regions; k++) { - regions[k].avg_noise_var = lowest_noise; - analyze_region(stats, k, regions, coeff); - } - return; - } - - // Analyze other regions - for (k = 0; k < num_regions; k++) { - if (regions[k].type != STABLE_REGION) { - // use the average of the nearest previous and next stable regions - int count = 0; - regions[k].avg_noise_var = 0; - for (int r = k - 1; r >= 0; r--) { - if (regions[r].type == STABLE_REGION) { - count++; - regions[k].avg_noise_var += regions[r].avg_noise_var; - break; - } - } - for (int r = k + 1; r < num_regions; r++) { - if (regions[r].type == STABLE_REGION) { - count++; - regions[k].avg_noise_var += regions[r].avg_noise_var; - break; - } - } - if (count) { - regions[k].avg_noise_var /= (double)count; - } - analyze_region(stats, k, regions, coeff); - } +// Calculate the regions stats of every region. 
+static void get_region_stats(const FIRSTPASS_STATS *stats, REGIONS *regions, + int num_regions) { + for (int k = 0; k < num_regions; k++) { + analyze_region(stats, k, regions); } } // Find tentative stable regions static int find_stable_regions(const FIRSTPASS_STATS *stats, - const double *grad_coded, const int *ignore, - int this_start, int this_last, - REGIONS *regions) { + const double *grad_coded, int this_start, + int this_last, REGIONS *regions) { int i, j, k = 0; regions[k].start = this_start; for (i = this_start; i <= this_last; i++) { @@ -1377,7 +1343,7 @@ static int find_stable_regions(const FIRSTPASS_STATS *stats, int count = 0; for (j = -HALF_WIN; j <= HALF_WIN; j++) { int idx = AOMMIN(AOMMAX(i + j, this_start), this_last); - if (ignore[idx] || (idx > 0 && ignore[idx - 1])) continue; + if (stats[idx].is_flash || (idx > 0 && stats[idx - 1].is_flash)) continue; mean_intra += stats[idx].intra_error; var_intra += stats[idx].intra_error * stats[idx].intra_error; mean_coded += stats[idx].coded_error; @@ -1451,15 +1417,13 @@ static void remove_short_regions(REGIONS *regions, int *num_regions, } static void adjust_unstable_region_bounds(const FIRSTPASS_STATS *stats, - const int *is_flash, - const double *grad, REGIONS *regions, - double *coeff, int *num_regions) { + REGIONS *regions, int *num_regions) { int i, j, k; // Remove regions that are too short. Likely noise. remove_short_regions(regions, num_regions, STABLE_REGION, HALF_WIN); remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN); - get_region_stats(stats, is_flash, regions, coeff, *num_regions); + get_region_stats(stats, regions, *num_regions); // Adjust region boundaries. The thresholds are empirically obtained, but // overall the performance is not very sensitive to small changes to them. @@ -1469,34 +1433,24 @@ static void adjust_unstable_region_bounds(const FIRSTPASS_STATS *stats, // Adjust previous boundary. 
// First find the average intra/coded error in the previous // neighborhood. - double avg_intra_err = 0, avg_coded_err = 0, avg_coeff = 0; - int starti = AOMMAX(regions[k - 1].last - WINDOW_SIZE + 1, - regions[k - 1].start + 1); - int lasti = regions[k - 1].last; + double avg_intra_err = 0; + const int starti = AOMMAX(regions[k - 1].last - WINDOW_SIZE + 1, + regions[k - 1].start + 1); + const int lasti = regions[k - 1].last; int counti = 0; for (i = starti; i <= lasti; i++) { avg_intra_err += stats[i].intra_error; - avg_coded_err += stats[i].coded_error; - avg_coeff += coeff[i]; counti++; } if (counti > 0) { avg_intra_err = AOMMAX(avg_intra_err / (double)counti, 0.001); - avg_coded_err /= AOMMAX(avg_coded_err / (double)counti, 0.001); - avg_coeff /= AOMMIN(avg_intra_err / (double)counti, 0.99999); int count_coded = 0, count_grad = 0; for (j = lasti + 1; j <= regions[k].last; j++) { - int intra_close = + const int intra_close = fabs(stats[j].intra_error - avg_intra_err) / avg_intra_err < 0.1; - int coded_close = - fabs(stats[j].coded_error - avg_coded_err) / avg_coded_err < 0.15; - int grad_small = fabs(grad[j]) / avg_coded_err < 0.05; - int coded_small = stats[j].coded_error / avg_intra_err < 0.03; - int coeff_close = - (1 - coeff[j]) / (1 - avg_coeff) < 1.5 || coeff[j] > 0.995; - if (!coeff_close || (!coded_close && !coded_small)) count_coded--; - if (!grad_small && !coded_small) count_grad--; - + const int coded_small = stats[j].coded_error / avg_intra_err < 0.1; + const int coeff_close = stats[j].cor_coeff > 0.995; + if (!coeff_close || !coded_small) count_coded--; if (intra_close && count_coded >= 0 && count_grad >= 0) { // this frame probably belongs to the previous stable region regions[k - 1].last = j; @@ -1510,35 +1464,26 @@ static void adjust_unstable_region_bounds(const FIRSTPASS_STATS *stats, if (k < *num_regions - 1) { // Adjust next boundary. // First find the average intra/coded error in the next neighborhood. 
- double avg_intra_err = 0, avg_coded_err = 0, avg_coeff = 0; - int starti = regions[k + 1].start; - int lasti = AOMMIN(regions[k + 1].last - 1, - regions[k + 1].start + WINDOW_SIZE - 1); + double avg_intra_err = 0; + const int starti = regions[k + 1].start; + const int lasti = AOMMIN(regions[k + 1].last - 1, + regions[k + 1].start + WINDOW_SIZE - 1); int counti = 0; for (i = starti; i <= lasti; i++) { avg_intra_err += stats[i].intra_error; - avg_coded_err += stats[i + 1].coded_error; - avg_coeff += coeff[i]; counti++; } if (counti > 0) { avg_intra_err = AOMMAX(avg_intra_err / (double)counti, 0.001); - avg_coded_err /= AOMMAX(avg_coded_err / (double)counti, 0.001); - avg_coeff /= AOMMIN(avg_intra_err / (double)counti, 0.99999); // At the boundary, coded error is large, but still the frame is stable int count_coded = 1, count_grad = 1; for (j = starti - 1; j >= regions[k].start; j--) { - int intra_close = + const int intra_close = fabs(stats[j].intra_error - avg_intra_err) / avg_intra_err < 0.1; - int coded_close = - fabs(stats[j + 1].coded_error - avg_coded_err) / avg_coded_err < - 0.15; - int grad_small = fabs(grad[j + 1]) / avg_coded_err < 0.05; - int coded_small = stats[j + 1].coded_error / avg_intra_err < 0.03; - int coeff_close = - (1 - coeff[j + 1]) / (1 - avg_coeff) < 1.5 || coeff[j] > 0.995; - if (!coeff_close || (!coded_close && !coded_small)) count_coded--; - if (!grad_small && !coded_small) count_grad--; + const int coded_small = + stats[j + 1].coded_error / avg_intra_err < 0.1; + const int coeff_close = stats[j].cor_coeff > 0.995; + if (!coeff_close || !coded_small) count_coded--; if (intra_close && count_coded >= 0 && count_grad >= 0) { // this frame probably belongs to the next stable region regions[k + 1].start = j; @@ -1553,7 +1498,7 @@ static void adjust_unstable_region_bounds(const FIRSTPASS_STATS *stats, cleanup_regions(regions, num_regions); remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN); - get_region_stats(stats, 
is_flash, regions, coeff, *num_regions); + get_region_stats(stats, regions, *num_regions); // If a stable regions has higher error than neighboring high var regions, // or if the stable region has a lower average correlation, @@ -1561,25 +1506,31 @@ static void adjust_unstable_region_bounds(const FIRSTPASS_STATS *stats, k = 0; while (k < *num_regions && (*num_regions) > 1) { if (regions[k].type == STABLE_REGION && + (regions[k].last - regions[k].start + 1) < 2 * WINDOW_SIZE && ((k > 0 && // previous regions - (regions[k].avg_coded_err > regions[k - 1].avg_coded_err || - regions[k].avg_cor_coeff < regions[k - 1].avg_cor_coeff)) && + (regions[k].avg_coded_err > regions[k - 1].avg_coded_err * 1.01 || + regions[k].avg_cor_coeff < regions[k - 1].avg_cor_coeff * 0.999)) && (k < *num_regions - 1 && // next region - (regions[k].avg_coded_err > regions[k + 1].avg_coded_err || - regions[k].avg_cor_coeff < regions[k + 1].avg_cor_coeff)))) { + (regions[k].avg_coded_err > regions[k + 1].avg_coded_err * 1.01 || + regions[k].avg_cor_coeff < regions[k + 1].avg_cor_coeff * 0.999)))) { // merge current region with the previous and next regions remove_region(2, regions, num_regions, &k); - analyze_region(stats, k - 1, regions, coeff); + analyze_region(stats, k - 1, regions); } else if (regions[k].type == HIGH_VAR_REGION && + (regions[k].last - regions[k].start + 1) < 2 * WINDOW_SIZE && ((k > 0 && // previous regions - (regions[k].avg_coded_err < regions[k - 1].avg_coded_err || - regions[k].avg_cor_coeff > regions[k - 1].avg_cor_coeff)) && + (regions[k].avg_coded_err < + regions[k - 1].avg_coded_err * 0.99 || + regions[k].avg_cor_coeff > + regions[k - 1].avg_cor_coeff * 1.001)) && (k < *num_regions - 1 && // next region - (regions[k].avg_coded_err < regions[k + 1].avg_coded_err || - regions[k].avg_cor_coeff > regions[k + 1].avg_cor_coeff)))) { + (regions[k].avg_coded_err < + regions[k + 1].avg_coded_err * 0.99 || + regions[k].avg_cor_coeff > + regions[k + 1].avg_cor_coeff * 1.001)))) 
{ // merge current region with the previous and next regions remove_region(2, regions, num_regions, &k); - analyze_region(stats, k - 1, regions, coeff); + analyze_region(stats, k - 1, regions); } else { k++; } @@ -1591,8 +1542,7 @@ static void adjust_unstable_region_bounds(const FIRSTPASS_STATS *stats, // Identify blending regions. static void find_blending_regions(const FIRSTPASS_STATS *stats, - const int *is_flash, REGIONS *regions, - int *num_regions, double *coeff) { + REGIONS *regions, int *num_regions) { int i, k = 0; // Blending regions will have large content change, therefore will have a // large consistent change in intra error. @@ -1607,7 +1557,8 @@ static void find_blending_regions(const FIRSTPASS_STATS *stats, int start = 0, last; for (i = regions[k].start; i <= regions[k].last; i++) { // First mark the regions that has consistent large change of intra error. - if (is_flash[i] || (i > 0 && is_flash[i - 1])) continue; + if (k == 0 && i == regions[k].start) continue; + if (stats[i].is_flash || (i > 0 && stats[i - 1].is_flash)) continue; double grad = stats[i].intra_error - stats[i - 1].intra_error; int large_change = fabs(grad) / AOMMAX(stats[i].intra_error, 0.01) > 0.05; int this_dir = 0; @@ -1622,7 +1573,11 @@ static void find_blending_regions(const FIRSTPASS_STATS *stats, insert_region(start, last, BLENDING_REGION, regions, num_regions, &k); } dir = this_dir; - start = i; + if (k == 0 && i == regions[k].start + 1) { + start = i - 1; + } else { + start = i; + } } if (dir != 0) { last = regions[k].last; @@ -1633,14 +1588,14 @@ static void find_blending_regions(const FIRSTPASS_STATS *stats, // If the blending region has very low correlation, mark it as high variance // since we probably cannot benefit from it anyways. 
- get_region_stats(stats, is_flash, regions, coeff, *num_regions); + get_region_stats(stats, regions, *num_regions); for (k = 0; k < *num_regions; k++) { if (regions[k].type != BLENDING_REGION) continue; if (regions[k].last == regions[k].start || regions[k].avg_cor_coeff < 0.6 || count_stable == 0) regions[k].type = HIGH_VAR_REGION; } - get_region_stats(stats, is_flash, regions, coeff, *num_regions); + get_region_stats(stats, regions, *num_regions); // It is possible for blending to result in a "dip" in intra error (first // decrease then increase). Therefore we need to find the dip and combine the @@ -1669,7 +1624,7 @@ static void find_blending_regions(const FIRSTPASS_STATS *stats, if (regions[k].avg_sr_fr_ratio > ratio_thres) { regions[k].type = BLENDING_REGION; remove_region(2, regions, num_regions, &k); - analyze_region(stats, k - 1, regions, coeff); + analyze_region(stats, k - 1, regions); continue; } } @@ -1727,7 +1682,7 @@ static void find_blending_regions(const FIRSTPASS_STATS *stats, if (to_merge) { remove_region(0, regions, num_regions, &k); - analyze_region(stats, k - 1, regions, coeff); + analyze_region(stats, k - 1, regions); continue; } else { // These are possibly two separate blending regions. Mark the boundary @@ -1735,9 +1690,9 @@ static void find_blending_regions(const FIRSTPASS_STATS *stats, int prev_k = k - 1; insert_region(regions[prev_k].last, regions[prev_k].last, HIGH_VAR_REGION, regions, num_regions, &prev_k); - analyze_region(stats, prev_k, regions, coeff); + analyze_region(stats, prev_k, regions); k = prev_k + 1; - analyze_region(stats, k, regions, coeff); + analyze_region(stats, k, regions); } } k++; @@ -1793,16 +1748,13 @@ static void cleanup_blendings(REGIONS *regions, int *num_regions) { // pointing to. 
static void identify_regions(const FIRSTPASS_STATS *const stats_start, int total_frames, int offset, REGIONS *regions, - int *total_regions, double *cor_coeff) { + int *total_regions) { int k; if (total_frames <= 1) return; - double *coeff = cor_coeff + offset; - // store the initial decisions REGIONS temp_regions[MAX_FIRSTPASS_ANALYSIS_FRAMES]; av1_zero_array(temp_regions, MAX_FIRSTPASS_ANALYSIS_FRAMES); - int is_flash[MAX_FIRSTPASS_ANALYSIS_FRAMES] = { 0 }; // buffers for filtered stats double filt_intra_err[MAX_FIRSTPASS_ANALYSIS_FRAMES] = { 0 }; double filt_coded_err[MAX_FIRSTPASS_ANALYSIS_FRAMES] = { 0 }; @@ -1810,32 +1762,28 @@ static void identify_regions(const FIRSTPASS_STATS *const stats_start, int cur_region = 0, this_start = 0, this_last; - // find possible flash frames - mark_flashes(stats_start, 0, total_frames - 1, is_flash); - - // first get the obvious scenecuts int next_scenecut = -1; - do { + // first get the obvious scenecuts next_scenecut = - find_next_scenecut(stats_start, this_start, total_frames - 1, is_flash); + find_next_scenecut(stats_start, this_start, total_frames - 1); this_last = (next_scenecut >= 0) ? 
(next_scenecut - 1) : total_frames - 1; + // low-pass filter the needed stats - smooth_filter_stats(stats_start, is_flash, this_start, this_last, - filt_intra_err, filt_coded_err); + smooth_filter_stats(stats_start, this_start, this_last, filt_intra_err, + filt_coded_err); get_gradient(filt_coded_err, this_start, this_last, grad_coded); // find tentative stable regions and unstable regions - int num_regions = find_stable_regions(stats_start, grad_coded, is_flash, - this_start, this_last, temp_regions); - adjust_unstable_region_bounds(stats_start, is_flash, grad_coded, - temp_regions, coeff, &num_regions); + int num_regions = find_stable_regions(stats_start, grad_coded, this_start, + this_last, temp_regions); - get_region_stats(stats_start, is_flash, temp_regions, coeff, num_regions); + adjust_unstable_region_bounds(stats_start, temp_regions, &num_regions); + + get_region_stats(stats_start, temp_regions, num_regions); // Try to identify blending regions in the unstable regions - find_blending_regions(stats_start, is_flash, temp_regions, &num_regions, - coeff); + find_blending_regions(stats_start, temp_regions, &num_regions); cleanup_blendings(temp_regions, &num_regions); // The flash points should all be considered high variance points @@ -1848,7 +1796,7 @@ static void identify_regions(const FIRSTPASS_STATS *const stats_start, int start = temp_regions[k].start; int last = temp_regions[k].last; for (int i = start; i <= last; i++) { - if (is_flash[i]) { + if (stats_start[i].is_flash) { insert_region(i, i, HIGH_VAR_REGION, temp_regions, &num_regions, &k); } } @@ -1858,6 +1806,11 @@ static void identify_regions(const FIRSTPASS_STATS *const stats_start, // copy the regions in the scenecut group for (k = 0; k < num_regions; k++) { + if (temp_regions[k].last < temp_regions[k].start && + k == num_regions - 1) { + num_regions--; + break; + } regions[k + cur_region] = temp_regions[k]; } cur_region += num_regions; @@ -1874,17 +1827,21 @@ static void identify_regions(const 
FIRSTPASS_STATS *const stats_start, } while (next_scenecut >= 0); *total_regions = cur_region; - get_region_stats(stats_start, is_flash, regions, coeff, *total_regions); + get_region_stats(stats_start, regions, *total_regions); for (k = 0; k < *total_regions; k++) { // If scenecuts are very minor, mark them as high variance. - if (regions[k].type != SCENECUT_REGION || regions[k].avg_cor_coeff < 0.8) { + if (regions[k].type != SCENECUT_REGION || + regions[k].avg_cor_coeff * + (1 - stats_start[regions[k].start].noise_var / + regions[k].avg_intra_err) < + 0.8) { continue; } regions[k].type = HIGH_VAR_REGION; } cleanup_regions(regions, total_regions); - get_region_stats(stats_start, is_flash, regions, coeff, *total_regions); + get_region_stats(stats_start, regions, *total_regions); for (k = 0; k < *total_regions; k++) { regions[k].start += offset; @@ -1911,16 +1868,17 @@ static int find_regions_index(const REGIONS *regions, int num_regions, * \param[in] max_gop_length Maximum length of the GF group * \param[in] max_intervals Maximum number of intervals to decide * - * \return Nothing is returned. Instead, cpi->rc.gf_intervals is + * \return Nothing is returned. Instead, cpi->ppi->rc.gf_intervals is * changed to store the decided GF group lengths. 
*/ static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length, int max_intervals) { RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + TWO_PASS *const twopass = &cpi->ppi->twopass; FIRSTPASS_STATS next_frame; const FIRSTPASS_STATS *const start_pos = twopass->stats_in; - FRAME_INFO *frame_info = &cpi->frame_info; + const FIRSTPASS_STATS *const stats = start_pos - (rc->frames_since_key == 0); int i; int flash_detected; @@ -1930,9 +1888,9 @@ static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length, if (has_no_stats_stage(cpi)) { for (i = 0; i < MAX_NUM_GF_INTERVALS; i++) { - rc->gf_intervals[i] = AOMMIN(rc->max_gf_interval, max_gop_length); + p_rc->gf_intervals[i] = AOMMIN(rc->max_gf_interval, max_gop_length); } - rc->cur_gf_index = 0; + p_rc->cur_gf_index = 0; rc->intervals_till_gf_calculate_due = MAX_NUM_GF_INTERVALS; return; } @@ -1944,17 +1902,17 @@ static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length, const int min_shrink_int = AOMMAX(MIN_SHRINK_LEN, active_min_gf_interval); i = (rc->frames_since_key == 0); - max_intervals = cpi->lap_enabled ? 1 : max_intervals; + max_intervals = cpi->ppi->lap_enabled ? 1 : max_intervals; int count_cuts = 1; // If cpi->gf_state.arf_gf_boost_lst is 0, we are starting with a KF or GF. 
- int cur_start = -1 + !cpi->gf_state.arf_gf_boost_lst, cur_last; + int cur_start = -1 + !cpi->ppi->gf_state.arf_gf_boost_lst, cur_last; int cut_pos[MAX_NUM_GF_INTERVALS + 1] = { -1 }; int cut_here; GF_GROUP_STATS gf_stats; init_gf_stats(&gf_stats); while (count_cuts < max_intervals + 1) { // reaches next key frame, break here - if (i >= rc->frames_to_key + rc->next_is_fwd_key) { + if (i >= rc->frames_to_key + p_rc->next_is_fwd_key) { cut_here = 2; } else if (i - cur_start >= rc->static_scene_max_gf_interval) { // reached maximum len, but nothing special yet (almost static) @@ -1969,7 +1927,7 @@ static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length, flash_detected = detect_flash(twopass, 0); // TODO(bohanli): remove redundant accumulations here, or unify // this and the ones in define_gf_group - accumulate_next_frame_stats(&next_frame, frame_info, flash_detected, + accumulate_next_frame_stats(&next_frame, flash_detected, rc->frames_since_key, i, &gf_stats); cut_here = detect_gf_cut(cpi, i, cur_start, flash_detected, @@ -1981,10 +1939,10 @@ static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length, int ori_last = cur_last; // The region frame idx does not start from the same frame as cur_start // and cur_last. Need to offset them. - int offset = rc->frames_since_key - rc->regions_offset; - REGIONS *regions = rc->regions; - int num_regions = rc->num_regions; - if (cpi->oxcf.kf_cfg.fwd_kf_enabled && rc->next_is_fwd_key) { + int offset = rc->frames_since_key - p_rc->regions_offset; + REGIONS *regions = p_rc->regions; + int num_regions = p_rc->num_regions; + if (cpi->oxcf.kf_cfg.fwd_kf_enabled && p_rc->next_is_fwd_key) { const int frames_left = rc->frames_to_key - i; const int min_int = AOMMIN(MIN_FWD_KF_INTERVAL, active_min_gf_interval); if (frames_left < min_int && frames_left > 0) { @@ -2021,7 +1979,11 @@ static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length, // If we have a scenecut, then stop at it. 
// TODO(bohanli): add logic here to stop before the scenecut and for // the next gop start from the scenecut with GF - int is_minor_sc = (regions[scenecut_idx].avg_cor_coeff > 0.6); + int is_minor_sc = + (regions[scenecut_idx].avg_cor_coeff * + (1 - stats[regions[scenecut_idx].start - offset].noise_var / + regions[scenecut_idx].avg_intra_err) > + 0.6); cur_last = regions[scenecut_idx].last - offset - !is_minor_sc; } else { int is_last_analysed = (k_last == num_regions - 1) && @@ -2032,45 +1994,91 @@ static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length, // if we are very close to the end, then do not shrink since it may // introduce intervals that are too short if (!(is_last_analysed && not_enough_regions)) { - int found = 0; - // first try to end at a stable area - for (int j = cur_last; j >= cur_start + min_shrink_int; j--) { - if (regions[find_regions_index(regions, num_regions, j + offset)] - .type == STABLE_REGION) { - cur_last = j; - found = 1; - break; - } + const double arf_length_factor = 0.1; + double best_score = 0; + int best_j = -1; + const int first_frame = regions[0].start - offset; + const int last_frame = regions[num_regions - 1].last - offset; + // score of how much the arf helps the whole GOP + double base_score = 0.0; + // Accumulate base_score in + for (int j = cur_start + 1; j < cur_start + min_shrink_int; j++) { + if (stats + j >= twopass->stats_buf_ctx->stats_in_end) break; + base_score = (base_score + 1.0) * stats[j].cor_coeff; } - if (!found) { - // Could not find stable point, - // try to find an OK point (high correlation, not blending) - for (int j = cur_last; j >= cur_start + min_shrink_int; j--) { - REGIONS *cur_region = - regions + - find_regions_index(regions, num_regions, j + offset); - double avg_coeff = cur_region->avg_cor_coeff; - if (rc->cor_coeff[j + offset] > avg_coeff && - cur_region->type != BLENDING_REGION) { - cur_last = j; - found = 1; + int met_blending = 0; // Whether we have met blending areas before + int 
last_blending = 0; // Whether the previous frame if blending + for (int j = cur_start + min_shrink_int; j <= cur_last; j++) { + if (stats + j >= twopass->stats_buf_ctx->stats_in_end) break; + base_score = (base_score + 1.0) * stats[j].cor_coeff; + int this_reg = + find_regions_index(regions, num_regions, j + offset); + if (this_reg < 0) continue; + // A GOP should include at most 1 blending region. + if (regions[this_reg].type == BLENDING_REGION) { + last_blending = 1; + if (met_blending) { break; + } else { + base_score = 0; + continue; } + } else { + if (last_blending) met_blending = 1; + last_blending = 0; + } + + // Add the factor of how good the neighborhood is for this + // candidate arf. + double this_score = arf_length_factor * base_score; + double temp_accu_coeff = 1.0; + // following frames + int count_f = 0; + for (int n = j + 1; n <= j + 3 && n <= last_frame; n++) { + if (stats + n >= twopass->stats_buf_ctx->stats_in_end) break; + temp_accu_coeff *= stats[n].cor_coeff; + this_score += + temp_accu_coeff * + (1 - stats[n].noise_var / + AOMMAX(regions[this_reg].avg_intra_err, 0.001)); + count_f++; + } + // preceding frames + temp_accu_coeff = 1.0; + for (int n = j; n > j - 3 * 2 + count_f && n > first_frame; n--) { + if (stats + n < twopass->stats_buf_ctx->stats_in_start) break; + temp_accu_coeff *= stats[n].cor_coeff; + this_score += + temp_accu_coeff * + (1 - stats[n].noise_var / + AOMMAX(regions[this_reg].avg_intra_err, 0.001)); + } + + if (this_score > best_score) { + best_score = this_score; + best_j = j; } } - if (!found) { - // Could not find a better point, - // try not to cut in blending areas - for (int j = cur_last; j >= cur_start + min_shrink_int; j--) { - REGIONS *cur_region = - regions + - find_regions_index(regions, num_regions, j + offset); - if (cur_region->type != BLENDING_REGION) { - cur_last = j; - break; + + // For blending areas, move one more frame in case we missed the + // first blending frame. 
+ int best_reg = + find_regions_index(regions, num_regions, best_j + offset); + if (best_reg < num_regions - 1 && best_reg > 0) { + if (regions[best_reg - 1].type == BLENDING_REGION && + regions[best_reg + 1].type == BLENDING_REGION) { + if (best_j + offset == regions[best_reg].start && + best_j + offset < regions[best_reg].last) { + best_j += 1; + } else if (best_j + offset == regions[best_reg].last && + best_j + offset > regions[best_reg].start) { + best_j -= 1; } } } + + if (cur_last - best_j < 2) best_j = cur_last; + if (best_j > 0 && best_score > 0.1) cur_last = best_j; // if cannot find anything, just cut at the original place. } } @@ -2081,11 +2089,11 @@ static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length, // reset pointers to the shrinked location twopass->stats_in = start_pos + cur_last; cur_start = cur_last; - if (regions[find_regions_index(regions, num_regions, - cur_start + 1 + offset)] - .type == SCENECUT_REGION) { - cur_start++; - } + int cur_region_idx = + find_regions_index(regions, num_regions, cur_start + 1 + offset); + if (cur_region_idx >= 0) + if (regions[cur_region_idx].type == SCENECUT_REGION) cur_start++; + i = cur_last; if (cut_here > 1 && cur_last == ori_last) break; @@ -2099,9 +2107,9 @@ static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length, // save intervals rc->intervals_till_gf_calculate_due = count_cuts - 1; for (int n = 1; n < count_cuts; n++) { - rc->gf_intervals[n - 1] = cut_pos[n] - cut_pos[n - 1]; + p_rc->gf_intervals[n - 1] = cut_pos[n] - cut_pos[n - 1]; } - rc->cur_gf_index = 0; + p_rc->cur_gf_index = 0; twopass->stats_in = start_pos; } @@ -2110,12 +2118,13 @@ static void correct_frames_to_key(AV1_COMP *cpi) { (int)av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage); if (lookahead_size < av1_lookahead_pop_sz(cpi->ppi->lookahead, cpi->compressor_stage)) { - assert(IMPLIES(cpi->oxcf.pass != 0 && cpi->frames_left > 0, - lookahead_size == cpi->frames_left)); + assert(IMPLIES(cpi->oxcf.pass != 0 
&& cpi->ppi->frames_left > 0, + lookahead_size == cpi->ppi->frames_left)); cpi->rc.frames_to_key = AOMMIN(cpi->rc.frames_to_key, lookahead_size); - } else if (cpi->frames_left > 0) { + } else if (cpi->ppi->frames_left > 0) { // Correct frames to key based on limit - cpi->rc.frames_to_key = AOMMIN(cpi->rc.frames_to_key, cpi->frames_left); + cpi->rc.frames_to_key = + AOMMIN(cpi->rc.frames_to_key, cpi->ppi->frames_left); } } @@ -2129,11 +2138,12 @@ static void correct_frames_to_key(AV1_COMP *cpi) { * * \param[in] cpi Top-level encoder structure * - * \return Nothing is returned. Instead, cpi->gf_group is changed. + * \return Nothing is returned. Instead, cpi->ppi->gf_group is changed. */ static void define_gf_group_pass0(AV1_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; - GF_GROUP *const gf_group = &cpi->gf_group; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const GFConfig *const gf_cfg = &oxcf->gf_cfg; int target; @@ -2141,28 +2151,28 @@ static void define_gf_group_pass0(AV1_COMP *cpi) { if (oxcf->q_cfg.aq_mode == CYCLIC_REFRESH_AQ) { av1_cyclic_refresh_set_golden_update(cpi); } else { - rc->baseline_gf_interval = rc->gf_intervals[rc->cur_gf_index]; + p_rc->baseline_gf_interval = p_rc->gf_intervals[p_rc->cur_gf_index]; rc->intervals_till_gf_calculate_due--; - rc->cur_gf_index++; + p_rc->cur_gf_index++; } // correct frames_to_key when lookahead queue is flushing correct_frames_to_key(cpi); - if (rc->baseline_gf_interval > rc->frames_to_key) - rc->baseline_gf_interval = rc->frames_to_key; + if (p_rc->baseline_gf_interval > rc->frames_to_key) + p_rc->baseline_gf_interval = rc->frames_to_key; - rc->gfu_boost = DEFAULT_GF_BOOST; - rc->constrained_gf_group = - (rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0; + p_rc->gfu_boost = DEFAULT_GF_BOOST; + p_rc->constrained_gf_group = + (p_rc->baseline_gf_interval >= rc->frames_to_key) ? 
1 : 0; gf_group->max_layer_depth_allowed = oxcf->gf_cfg.gf_max_pyr_height; // Rare case when the look-ahead is less than the target GOP length, can't // generate ARF frame. - if (rc->baseline_gf_interval > gf_cfg->lag_in_frames || + if (p_rc->baseline_gf_interval > gf_cfg->lag_in_frames || !is_altref_enabled(gf_cfg->lag_in_frames, gf_cfg->enable_auto_arf) || - rc->baseline_gf_interval < rc->min_gf_interval) + p_rc->baseline_gf_interval < rc->min_gf_interval) gf_group->max_layer_depth_allowed = 0; // Set up the structure of this Group-Of-Pictures (same as GF_GROUP) @@ -2194,7 +2204,8 @@ static INLINE void set_baseline_gf_interval(AV1_COMP *cpi, int arf_position, int use_alt_ref, int is_final_pass) { RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + TWO_PASS *const twopass = &cpi->ppi->twopass; // Set the interval until the next gf. // If forward keyframes are enabled, ensure the final gf group obeys the // MIN_FWD_KF_INTERVAL. 
@@ -2203,27 +2214,28 @@ static INLINE void set_baseline_gf_interval(AV1_COMP *cpi, int arf_position, twopass->stats_buf_ctx->stats_in_end; if (cpi->oxcf.kf_cfg.fwd_kf_enabled && use_alt_ref && !is_last_kf && - cpi->rc.next_is_fwd_key) { + cpi->ppi->p_rc.next_is_fwd_key) { if (arf_position == rc->frames_to_key + 1) { - rc->baseline_gf_interval = arf_position; + p_rc->baseline_gf_interval = arf_position; // if the last gf group will be smaller than MIN_FWD_KF_INTERVAL } else if (rc->frames_to_key + 1 - arf_position < AOMMAX(MIN_FWD_KF_INTERVAL, rc->min_gf_interval)) { // if possible, merge the last two gf groups if (rc->frames_to_key + 1 <= active_max_gf_interval) { - rc->baseline_gf_interval = rc->frames_to_key + 1; + p_rc->baseline_gf_interval = rc->frames_to_key + 1; if (is_final_pass) rc->intervals_till_gf_calculate_due = 0; // if merging the last two gf groups creates a group that is too long, // split them and force the last gf group to be the MIN_FWD_KF_INTERVAL } else { - rc->baseline_gf_interval = rc->frames_to_key + 1 - MIN_FWD_KF_INTERVAL; + p_rc->baseline_gf_interval = + rc->frames_to_key + 1 - MIN_FWD_KF_INTERVAL; if (is_final_pass) rc->intervals_till_gf_calculate_due = 0; } } else { - rc->baseline_gf_interval = arf_position; + p_rc->baseline_gf_interval = arf_position; } } else { - rc->baseline_gf_interval = arf_position; + p_rc->baseline_gf_interval = arf_position; } } @@ -2269,18 +2281,19 @@ static void init_gf_stats(GF_GROUP_STATS *gf_stats) { * \param[in] is_final_pass Whether this is the final pass for the * GF group, or a trial (non-zero) * - * \return Nothing is returned. Instead, cpi->gf_group is changed. + * \return Nothing is returned. Instead, cpi->ppi->gf_group is changed. 
*/ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, EncodeFrameParams *frame_params, int max_gop_length, int is_final_pass) { AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const AV1EncoderConfig *const oxcf = &cpi->oxcf; - TWO_PASS *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->ppi->twopass; FIRSTPASS_STATS next_frame; const FIRSTPASS_STATS *const start_pos = twopass->stats_in; - GF_GROUP *gf_group = &cpi->gf_group; + GF_GROUP *gf_group = &cpi->ppi->gf_group; FRAME_INFO *frame_info = &cpi->frame_info; const GFConfig *const gf_cfg = &oxcf->gf_cfg; const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; @@ -2289,12 +2302,13 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, int64_t gf_group_bits; const int is_intra_only = rc->frames_since_key == 0; - cpi->internal_altref_allowed = (gf_cfg->gf_max_pyr_height > 1); + cpi->ppi->internal_altref_allowed = (gf_cfg->gf_max_pyr_height > 1); // Reset the GF group data structures unless this is a key // frame in which case it will already have been done. 
if (!is_intra_only) { - av1_zero(cpi->gf_group); + av1_zero(cpi->ppi->gf_group); + cpi->gf_frame_index = 0; } aom_clear_system_state(); @@ -2306,7 +2320,7 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, } // correct frames_to_key when lookahead queue is emptying - if (cpi->lap_enabled) { + if (cpi->ppi->lap_enabled) { correct_frames_to_key(cpi); } @@ -2336,8 +2350,8 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, AOMMIN(rc->max_gf_interval, max_gop_length); i = is_intra_only; - // get the determined gf group length from rc->gf_intervals - while (i < rc->gf_intervals[rc->cur_gf_index]) { + // get the determined gf group length from p_rc->gf_intervals + while (i < p_rc->gf_intervals[p_rc->cur_gf_index]) { // read in the next frame if (EOF == input_stats(twopass, &next_frame)) break; // Accumulate error score of frames in this gf group. @@ -2360,7 +2374,7 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, i = is_intra_only; input_stats(twopass, &next_frame); - while (i < rc->gf_intervals[rc->cur_gf_index]) { + while (i < p_rc->gf_intervals[p_rc->cur_gf_index]) { // read in the next frame if (EOF == input_stats(twopass, &next_frame)) break; @@ -2369,13 +2383,13 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, flash_detected = detect_flash(twopass, 0); // accumulate stats for next frame - accumulate_next_frame_stats(&next_frame, frame_info, flash_detected, + accumulate_next_frame_stats(&next_frame, flash_detected, rc->frames_since_key, i, &gf_stats); ++i; } - i = rc->gf_intervals[rc->cur_gf_index]; + i = p_rc->gf_intervals[p_rc->cur_gf_index]; // save the errs for the last frame last_frame_stats.frame_coded_error = next_frame.coded_error; @@ -2384,11 +2398,11 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, if (is_final_pass) { rc->intervals_till_gf_calculate_due--; - rc->cur_gf_index++; + p_rc->cur_gf_index++; } // Was the group length 
constrained by the requirement for a new KF? - rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; + p_rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; const int num_mbs = (oxcf->resize_cfg.resize_mode != RESIZE_NONE) ? cpi->initial_mbs @@ -2407,32 +2421,34 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, gf_stats.zero_motion_accumulator > MIN_ZERO_MOTION && gf_stats.avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR && gf_stats.avg_raw_err_stdev < MAX_RAW_ERR_VAR) { - cpi->internal_altref_allowed = 0; + cpi->ppi->internal_altref_allowed = 0; } int use_alt_ref; if (can_disable_arf) { use_alt_ref = !is_almost_static(gf_stats.zero_motion_accumulator, - twopass->kf_zeromotion_pct, cpi->lap_enabled) && - rc->use_arf_in_this_kf_group && (i < gf_cfg->lag_in_frames) && + twopass->kf_zeromotion_pct, cpi->ppi->lap_enabled) && + p_rc->use_arf_in_this_kf_group && (i < gf_cfg->lag_in_frames) && (i >= MIN_GF_INTERVAL); + FIRSTPASS_STATS *total_stats = twopass->stats_buf_ctx->total_stats; // TODO(urvang): Improve and use model for VBR, CQ etc as well. - if (use_alt_ref && rc_cfg->mode == AOM_Q && rc_cfg->cq_level <= 200) { + if (use_alt_ref && use_ml_model_to_decide_flat_gop(rc_cfg) && + !is_fp_stats_to_predict_flat_gop_invalid(total_stats)) { aom_clear_system_state(); float features[21]; get_features_from_gf_stats( &gf_stats, &first_frame_stats, &last_frame_stats, num_mbs, - rc->constrained_gf_group, twopass->kf_zeromotion_pct, i, features); + p_rc->constrained_gf_group, twopass->kf_zeromotion_pct, i, features); // Infer using ML model. 
float score; av1_nn_predict(features, &av1_use_flat_gop_nn_config, 1, &score); use_alt_ref = (score <= 0.0); } } else { - use_alt_ref = - rc->use_arf_in_this_kf_group && (i < gf_cfg->lag_in_frames) && (i > 2); + use_alt_ref = p_rc->use_arf_in_this_kf_group && + (i < gf_cfg->lag_in_frames) && (i > 2); } #define REDUCE_GF_LENGTH_THRESH 4 @@ -2443,7 +2459,7 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, // work well for certain other cases. const int allow_gf_length_reduction = ((rc_cfg->mode == AOM_Q && rc_cfg->cq_level <= 128) || - !cpi->internal_altref_allowed) && + !cpi->ppi->internal_altref_allowed) && !is_lossless_requested(rc_cfg); if (allow_gf_length_reduction && use_alt_ref) { @@ -2485,48 +2501,48 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, : AOMMAX(0, rc->frames_to_key - i); // Calculate the boost for alt ref. - rc->gfu_boost = av1_calc_arf_boost( - twopass, rc, frame_info, alt_offset, forward_frames, ext_len, - cpi->lap_enabled ? &rc->num_stats_used_for_gfu_boost : NULL, - cpi->lap_enabled ? &rc->num_stats_required_for_gfu_boost : NULL); + p_rc->gfu_boost = av1_calc_arf_boost( + twopass, p_rc, rc, frame_info, alt_offset, forward_frames, ext_len, + &p_rc->num_stats_used_for_gfu_boost, + &p_rc->num_stats_required_for_gfu_boost, cpi->ppi->lap_enabled); } else { reset_fpf_position(twopass, start_pos); gf_group->max_layer_depth_allowed = 0; set_baseline_gf_interval(cpi, i, active_max_gf_interval, use_alt_ref, is_final_pass); - rc->gfu_boost = AOMMIN( + p_rc->gfu_boost = AOMMIN( MAX_GF_BOOST, - av1_calc_arf_boost( - twopass, rc, frame_info, alt_offset, ext_len, 0, - cpi->lap_enabled ? &rc->num_stats_used_for_gfu_boost : NULL, - cpi->lap_enabled ? 
&rc->num_stats_required_for_gfu_boost : NULL)); + av1_calc_arf_boost(twopass, p_rc, rc, frame_info, alt_offset, ext_len, + 0, &p_rc->num_stats_used_for_gfu_boost, + &p_rc->num_stats_required_for_gfu_boost, + cpi->ppi->lap_enabled)); } #define LAST_ALR_BOOST_FACTOR 0.2f - rc->arf_boost_factor = 1.0; + p_rc->arf_boost_factor = 1.0; if (use_alt_ref && !is_lossless_requested(rc_cfg)) { // Reduce the boost of altref in the last gf group if (rc->frames_to_key - ext_len == REDUCE_GF_LENGTH_BY || rc->frames_to_key - ext_len == 0) { - rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR; + p_rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR; } } - rc->frames_till_gf_update_due = rc->baseline_gf_interval; + rc->frames_till_gf_update_due = p_rc->baseline_gf_interval; // Reset the file position. reset_fpf_position(twopass, start_pos); - if (cpi->lap_enabled) { + if (cpi->ppi->lap_enabled) { // Since we don't have enough stats to know the actual error of the // gf group, we assume error of each frame to be equal to 1 and set // the error of the group as baseline_gf_interval. - gf_stats.gf_group_err = rc->baseline_gf_interval; + gf_stats.gf_group_err = p_rc->baseline_gf_interval; } // Calculate the bits to be allocated to the gf/arf group as a whole gf_group_bits = calculate_total_gf_group_bits(cpi, gf_stats.gf_group_err); - rc->gf_group_bits = gf_group_bits; + p_rc->gf_group_bits = gf_group_bits; #if GROUP_ADAPTIVE_MAXQ // Calculate an estimate of the maxq needed for the group. @@ -2534,17 +2550,17 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, // where there could be significant overshoot than for easier // sections where we do not wish to risk creating an overshoot // of the allocated bit budget. 
- if ((rc_cfg->mode != AOM_Q) && (rc->baseline_gf_interval > 1) && + if ((rc_cfg->mode != AOM_Q) && (p_rc->baseline_gf_interval > 1) && is_final_pass) { const int vbr_group_bits_per_frame = - (int)(gf_group_bits / rc->baseline_gf_interval); + (int)(gf_group_bits / p_rc->baseline_gf_interval); const double group_av_err = - gf_stats.gf_group_raw_error / rc->baseline_gf_interval; + gf_stats.gf_group_raw_error / p_rc->baseline_gf_interval; const double group_av_skip_pct = - gf_stats.gf_group_skip_pct / rc->baseline_gf_interval; + gf_stats.gf_group_skip_pct / p_rc->baseline_gf_interval; const double group_av_inactive_zone = ((gf_stats.gf_group_inactive_zone_rows * 2) / - (rc->baseline_gf_interval * (double)cm->mi_params.mb_rows)); + (p_rc->baseline_gf_interval * (double)cm->mi_params.mb_rows)); int tmp_q; tmp_q = get_twopass_worst_quality( @@ -2568,7 +2584,7 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, if (rc->frames_since_key != 0) { twopass->section_intra_rating = calculate_section_intra_ratio( start_pos, twopass->stats_buf_ctx->stats_in_end, - rc->baseline_gf_interval); + p_rc->baseline_gf_interval); } av1_gop_bit_allocation(cpi, rc, gf_group, rc->frames_since_key == 0, @@ -2577,12 +2593,12 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, frame_params->frame_type = rc->frames_since_key == 0 ? KEY_FRAME : INTER_FRAME; frame_params->show_frame = - !(gf_group->update_type[gf_group->index] == ARF_UPDATE || - gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE); + !(gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE || + gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE); // TODO(jingning): Generalize this condition. if (is_final_pass) { - cpi->gf_state.arf_gf_boost_lst = use_alt_ref; + cpi->ppi->gf_state.arf_gf_boost_lst = use_alt_ref; // Reset rolling actual and target bits counters for ARF groups. 
twopass->rolling_arf_group_target_bits = 1; @@ -2597,12 +2613,13 @@ static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc, GF_GROUP *gf_group, int is_key_frame, int use_arf, int64_t gf_group_bits) { + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; // Calculate the extra bits to be used for boosted frame(s) #ifdef FIXED_ARF_BITS int gf_arf_bits = (int)(ARF_BITS_FRACTION * gf_group_bits); #else int gf_arf_bits = calculate_boost_bits( - rc->baseline_gf_interval - (rc->frames_since_key == 0), rc->gfu_boost, + p_rc->baseline_gf_interval - (rc->frames_since_key == 0), p_rc->gfu_boost, gf_group_bits); #endif @@ -2610,8 +2627,8 @@ void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc, gf_group_bits, 1); // Allocate bits to each of the frames in the GF group. - allocate_gf_group_bits(gf_group, rc, gf_group_bits, gf_arf_bits, is_key_frame, - use_arf); + allocate_gf_group_bits(gf_group, p_rc, rc, gf_group_bits, gf_arf_bits, + is_key_frame, use_arf); } // Minimum % intra coding observed in first pass (1.0 = 100%) @@ -2786,10 +2803,10 @@ static int test_candidate_kf(TWO_PASS *twopass, #define MIN_STATIC_KF_BOOST 5400 // Minimum boost for static KF interval static int detect_app_forced_key(AV1_COMP *cpi) { - if (cpi->oxcf.kf_cfg.fwd_kf_enabled) cpi->rc.next_is_fwd_key = 1; + if (cpi->oxcf.kf_cfg.fwd_kf_enabled) cpi->ppi->p_rc.next_is_fwd_key = 1; int num_frames_to_app_forced_key = is_forced_keyframe_pending( cpi->ppi->lookahead, cpi->ppi->lookahead->max_sz, cpi->compressor_stage); - if (num_frames_to_app_forced_key != -1) cpi->rc.next_is_fwd_key = 0; + if (num_frames_to_app_forced_key != -1) cpi->ppi->p_rc.next_is_fwd_key = 0; return num_frames_to_app_forced_key; } @@ -2799,16 +2816,16 @@ static int get_projected_kf_boost(AV1_COMP *cpi) { * all stats needed for prior boost calculation are available. * Hence projecting the prior boost is not needed in this cases. 
*/ - if (cpi->rc.num_stats_used_for_kf_boost >= cpi->rc.frames_to_key) - return cpi->rc.kf_boost; + if (cpi->ppi->p_rc.num_stats_used_for_kf_boost >= cpi->rc.frames_to_key) + return cpi->ppi->p_rc.kf_boost; // Get the current tpl factor (number of frames = frames_to_key). double tpl_factor = av1_get_kf_boost_projection_factor(cpi->rc.frames_to_key); // Get the tpl factor when number of frames = num_stats_used_for_kf_boost. - double tpl_factor_num_stats = - av1_get_kf_boost_projection_factor(cpi->rc.num_stats_used_for_kf_boost); + double tpl_factor_num_stats = av1_get_kf_boost_projection_factor( + cpi->ppi->p_rc.num_stats_used_for_kf_boost); int projected_kf_boost = - (int)rint((tpl_factor * cpi->rc.kf_boost) / tpl_factor_num_stats); + (int)rint((tpl_factor * cpi->ppi->p_rc.kf_boost) / tpl_factor_num_stats); return projected_kf_boost; } @@ -2828,8 +2845,9 @@ static int get_projected_kf_boost(AV1_COMP *cpi) { static int define_kf_interval(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, double *kf_group_err, int num_frames_to_detect_scenecut) { - TWO_PASS *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->ppi->twopass; RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg; double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; @@ -2874,7 +2892,7 @@ static int define_kf_interval(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, input_stats(twopass, this_frame); // Provided that we are not at the end of the file... 
- if ((cpi->rc.enable_scenecut_detection > 0) && kf_cfg->auto_key && + if ((cpi->ppi->p_rc.enable_scenecut_detection > 0) && kf_cfg->auto_key && twopass->stats_in < twopass->stats_buf_ctx->stats_in_end) { double loop_decay_rate; @@ -2882,14 +2900,13 @@ static int define_kf_interval(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, if (frames_since_key >= kf_cfg->key_freq_min && test_candidate_kf(twopass, &last_frame, this_frame, twopass->stats_in, frames_since_key, oxcf->rc_cfg.mode, - cpi->rc.enable_scenecut_detection)) { + cpi->ppi->p_rc.enable_scenecut_detection)) { scenecut_detected = 1; break; } // How fast is the prediction quality decaying? - loop_decay_rate = - get_prediction_decay_rate(frame_info, twopass->stats_in); + loop_decay_rate = get_prediction_decay_rate(twopass->stats_in); // We want to know something about the recent past... rather than // as used elsewhere where we are concerned with decay in prediction @@ -2909,7 +2926,7 @@ static int define_kf_interval(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, // In the case of transition followed by a static scene, the key frame // could be a good predictor for the following frames, therefore we // do not use an arf. 
- rc->use_arf_in_this_kf_group = 0; + p_rc->use_arf_in_this_kf_group = 0; break; } @@ -2928,14 +2945,14 @@ static int define_kf_interval(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, } if (kf_group_err != NULL) - rc->num_stats_used_for_kf_boost = num_stats_used_for_kf_boost; + p_rc->num_stats_used_for_kf_boost = num_stats_used_for_kf_boost; - if (cpi->lap_enabled && !scenecut_detected) + if (cpi->ppi->lap_enabled && !scenecut_detected) frames_to_key = num_frames_to_next_key; if (!kf_cfg->fwd_kf_enabled || scenecut_detected || twopass->stats_in >= twopass->stats_buf_ctx->stats_in_end) - rc->next_is_fwd_key = 0; + p_rc->next_is_fwd_key = 0; return frames_to_key; } @@ -2964,9 +2981,9 @@ static double get_kf_group_avg_error(TWO_PASS *twopass, static int64_t get_kf_group_bits(AV1_COMP *cpi, double kf_group_err, double kf_group_avg_error) { RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->ppi->twopass; int64_t kf_group_bits; - if (cpi->lap_enabled) { + if (cpi->ppi->lap_enabled) { kf_group_bits = (int64_t)rc->frames_to_key * rc->avg_frame_bandwidth; if (cpi->oxcf.rc_cfg.vbr_corpus_complexity_lap) { const int num_mbs = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE) @@ -2990,7 +3007,7 @@ static int64_t get_kf_group_bits(AV1_COMP *cpi, double kf_group_err, static int calc_avg_stats(AV1_COMP *cpi, FIRSTPASS_STATS *avg_frame_stat) { RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->ppi->twopass; FIRSTPASS_STATS cur_frame; av1_zero(cur_frame); int num_frames = 0; @@ -3039,7 +3056,7 @@ static double get_kf_boost_score(AV1_COMP *cpi, double kf_raw_err, double *zero_motion_accumulator, double *sr_accumulator, int use_avg_stat) { RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->ppi->twopass; FRAME_INFO *const frame_info = &cpi->frame_info; FIRSTPASS_STATS frame_stat; av1_zero(frame_stat); @@ 
-3061,8 +3078,7 @@ static double get_kf_boost_score(AV1_COMP *cpi, double kf_raw_err, // For the first frame in kf group, the second ref indicator is invalid. if (i > 0) { *zero_motion_accumulator = - AOMMIN(*zero_motion_accumulator, - get_zero_motion_factor(frame_info, &frame_stat)); + AOMMIN(*zero_motion_accumulator, get_zero_motion_factor(&frame_stat)); } else { *zero_motion_accumulator = frame_stat.pcnt_inter - frame_stat.pcnt_motion; } @@ -3102,8 +3118,9 @@ static double get_kf_boost_score(AV1_COMP *cpi, double kf_raw_err, */ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; - GF_GROUP *const gf_group = &cpi->gf_group; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + TWO_PASS *const twopass = &cpi->ppi->twopass; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; FRAME_INFO *const frame_info = &cpi->frame_info; AV1_COMMON *const cm = &cpi->common; CurrentFrame *const current_frame = &cm->current_frame; @@ -3115,27 +3132,26 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->frames_since_key = 0; // Use arfs if possible. - rc->use_arf_in_this_kf_group = is_altref_enabled( + p_rc->use_arf_in_this_kf_group = is_altref_enabled( oxcf->gf_cfg.lag_in_frames, oxcf->gf_cfg.enable_auto_arf); // Reset the GF group data structures. av1_zero(*gf_group); + cpi->gf_frame_index = 0; // KF is always a GF so clear frames till next gf counter. 
rc->frames_till_gf_update_due = 0; - rc->frames_to_key = 1; - if (has_no_stats_stage(cpi)) { int num_frames_to_app_forced_key = detect_app_forced_key(cpi); - rc->this_key_frame_forced = + p_rc->this_key_frame_forced = current_frame->frame_number != 0 && rc->frames_to_key == 0; if (num_frames_to_app_forced_key != -1) rc->frames_to_key = num_frames_to_app_forced_key; else rc->frames_to_key = AOMMAX(1, kf_cfg->key_freq_max); correct_frames_to_key(cpi); - rc->kf_boost = DEFAULT_KF_BOOST; + p_rc->kf_boost = DEFAULT_KF_BOOST; gf_group->update_type[0] = KF_UPDATE; return; } @@ -3153,7 +3169,7 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { int64_t kf_group_bits_clipped = INT64_MAX; // Is this a forced key frame by interval. - rc->this_key_frame_forced = rc->next_key_frame_forced; + p_rc->this_key_frame_forced = p_rc->next_key_frame_forced; twopass->kf_group_bits = 0; // Total bits available to kf group twopass->kf_group_error_left = 0; // Group modified error score. @@ -3169,7 +3185,7 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { else rc->frames_to_key = kf_cfg->key_freq_max; - if (cpi->lap_enabled) correct_frames_to_key(cpi); + if (cpi->ppi->lap_enabled) correct_frames_to_key(cpi); // If there is a max kf interval set by the user we must obey it. // We already breakout of the loop above at 2x max. 
@@ -3191,28 +3207,29 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { calculate_modified_err(frame_info, twopass, oxcf, &tmp_frame); if (EOF == input_stats(twopass, &tmp_frame)) break; } - rc->next_key_frame_forced = 1; + p_rc->next_key_frame_forced = 1; } else if ((twopass->stats_in == twopass->stats_buf_ctx->stats_in_end && is_stat_consumption_stage_twopass(cpi)) || rc->frames_to_key >= kf_cfg->key_freq_max) { - rc->next_key_frame_forced = 1; + p_rc->next_key_frame_forced = 1; } else { - rc->next_key_frame_forced = 0; + p_rc->next_key_frame_forced = 0; } - if (kf_cfg->fwd_kf_enabled) rc->next_is_fwd_key |= rc->next_key_frame_forced; + if (kf_cfg->fwd_kf_enabled) + p_rc->next_is_fwd_key |= p_rc->next_key_frame_forced; // Special case for the last key frame of the file. if (twopass->stats_in >= twopass->stats_buf_ctx->stats_in_end) { // Accumulate kf group error. kf_group_err += calculate_modified_err(frame_info, twopass, oxcf, this_frame); - rc->next_is_fwd_key = 0; + p_rc->next_is_fwd_key = 0; } // Calculate the number of bits that should be assigned to the kf group. if ((twopass->bits_left > 0 && twopass->modified_error_left > 0.0) || - (cpi->lap_enabled && oxcf->rc_cfg.mode != AOM_Q)) { + (cpi->ppi->lap_enabled && oxcf->rc_cfg.mode != AOM_Q)) { // Maximum number of bits for a single normal frame (not key frame). const int max_bits = frame_max_bits(rc, oxcf); @@ -3237,7 +3254,7 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { } twopass->kf_group_bits = AOMMAX(0, twopass->kf_group_bits); - if (cpi->lap_enabled) { + if (cpi->ppi->lap_enabled) { // In the case of single pass based on LAP, frames to key may have an // inaccurate value, and hence should be clipped to an appropriate // interval. 
@@ -3268,17 +3285,17 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->section_intra_rating = calculate_section_intra_ratio( start_position, twopass->stats_buf_ctx->stats_in_end, rc->frames_to_key); - rc->kf_boost = (int)boost_score; + p_rc->kf_boost = (int)boost_score; - if (cpi->lap_enabled) { + if (cpi->ppi->lap_enabled) { if (oxcf->rc_cfg.mode == AOM_Q) { - rc->kf_boost = get_projected_kf_boost(cpi); + p_rc->kf_boost = get_projected_kf_boost(cpi); } else { // TODO(any): Explore using average frame stats for AOM_Q as well. boost_score = get_kf_boost_score( cpi, kf_raw_err, &zero_motion_accumulator, &sr_accumulator, 1); reset_fpf_position(twopass, start_position); - rc->kf_boost += (int)boost_score; + p_rc->kf_boost += (int)boost_score; } } @@ -3286,13 +3303,13 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { // if the kf group is very short. if ((zero_motion_accumulator > STATIC_KF_GROUP_FLOAT_THRESH) && (rc->frames_to_key > 8)) { - rc->kf_boost = AOMMAX(rc->kf_boost, MIN_STATIC_KF_BOOST); + p_rc->kf_boost = AOMMAX(p_rc->kf_boost, MIN_STATIC_KF_BOOST); } else { // Apply various clamps for min and max boost - rc->kf_boost = AOMMAX(rc->kf_boost, (rc->frames_to_key * 3)); - rc->kf_boost = AOMMAX(rc->kf_boost, MIN_KF_BOOST); + p_rc->kf_boost = AOMMAX(p_rc->kf_boost, (rc->frames_to_key * 3)); + p_rc->kf_boost = AOMMAX(p_rc->kf_boost, MIN_KF_BOOST); #ifdef STRICT_RC - rc->kf_boost = AOMMIN(rc->kf_boost, MAX_KF_BOOST); + p_rc->kf_boost = AOMMIN(p_rc->kf_boost, MAX_KF_BOOST); #endif } @@ -3301,9 +3318,10 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { // very high, we calculate the bits based on a clipped value of // frames_to_key. 
kf_bits = calculate_boost_bits( - AOMMIN(rc->frames_to_key, frames_to_key_clipped) - 1, rc->kf_boost, + AOMMIN(rc->frames_to_key, frames_to_key_clipped) - 1, p_rc->kf_boost, AOMMIN(twopass->kf_group_bits, kf_group_bits_clipped)); - // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n", rc->kf_boost, + // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n", + // p_rc->kf_boost, // kf_bits, twopass->kf_zeromotion_pct); kf_bits = adjust_boost_bits_for_target_level(cpi, rc, kf_bits, twopass->kf_group_bits, 0); @@ -3315,7 +3333,7 @@ static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { gf_group->update_type[0] = KF_UPDATE; // Note the total error score of the kf group minus the key frame itself. - if (cpi->lap_enabled) + if (cpi->ppi->lap_enabled) // As we don't have enough stats to know the actual error of the group, // we assume the complexity of each frame to be equal to 1, and set the // error as the number of frames in the group(minus the keyframe). 
@@ -3335,7 +3353,7 @@ static int is_skippable_frame(const AV1_COMP *cpi) { // first pass, and so do its previous and forward frames, then this frame // can be skipped for partition check, and the partition size is assigned // according to the variance - const TWO_PASS *const twopass = &cpi->twopass; + const TWO_PASS *const twopass = &cpi->ppi->twopass; return (!frame_is_intra_only(&cpi->common) && twopass->stats_in - 2 > twopass->stats_buf_ctx->stats_in_start && @@ -3358,34 +3376,78 @@ static int get_section_target_bandwidth(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; CurrentFrame *const current_frame = &cm->current_frame; RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->ppi->twopass; int section_target_bandwidth; const int frames_left = (int)(twopass->stats_buf_ctx->total_stats->count - current_frame->frame_number); - if (cpi->lap_enabled) + if (cpi->ppi->lap_enabled) section_target_bandwidth = (int)rc->avg_frame_bandwidth; else section_target_bandwidth = (int)(twopass->bits_left / frames_left); return section_target_bandwidth; } +static INLINE void set_twopass_params_based_on_fp_stats( + const AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame_ptr) { + if (this_frame_ptr == NULL) return; + + TWO_PASS *const twopass = &cpi->ppi->twopass; + const int num_mbs = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE) + ? cpi->initial_mbs + : cpi->common.mi_params.MBs; + // The multiplication by 256 reverses a scaling factor of (>> 8) + // applied when combining MB error values for the frame. + twopass->mb_av_energy = log((this_frame_ptr->intra_error / num_mbs) + 1.0); + + const FIRSTPASS_STATS *const total_stats = + twopass->stats_buf_ctx->total_stats; + if (is_fp_wavelet_energy_invalid(total_stats) == 0) { + twopass->frame_avg_haar_energy = + log((this_frame_ptr->frame_avg_wavelet_energy / num_mbs) + 1.0); + } + + // Set the frame content type flag. 
+ if (this_frame_ptr->intra_skip_pct >= FC_ANIMATION_THRESH) + twopass->fr_content_type = FC_GRAPHICS_ANIMATION; + else + twopass->fr_content_type = FC_NORMAL; +} + static void process_first_pass_stats(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { AV1_COMMON *const cm = &cpi->common; CurrentFrame *const current_frame = &cm->current_frame; RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->ppi->twopass; + FIRSTPASS_STATS *total_stats = twopass->stats_buf_ctx->total_stats; + + if (current_frame->frame_number == 0) { + const GFConfig *const gf_cfg = &cpi->oxcf.gf_cfg; + const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg; + if (use_ml_model_to_decide_flat_gop(rc_cfg) && can_disable_altref(gf_cfg) && + is_fp_stats_to_predict_flat_gop_invalid(total_stats)) { + // warn( + // "First pass stats required in the ML model to predict a flat GOP " + // "structure is invalid. Continuing encoding by disabling the ML " + // "model.\n"); + // The first pass statistics like tr_coded_error, pcnt_third_ref, + // frame_avg_wavelet_energy are invalid as their calculations were + // skipped in the first pass of encoding. As these stats are required + // in the ML model to predict a flat GOP structure, the ML model would be + // disabled. This case arises when the encode configuration used in first + // pass encoding is different from second pass encoding. + } + } if (cpi->oxcf.rc_cfg.mode != AOM_Q && current_frame->frame_number == 0 && - cpi->gf_group.index == 0 && cpi->twopass.stats_buf_ctx->total_stats && - cpi->twopass.stats_buf_ctx->total_left_stats) { - if (cpi->lap_enabled) { + cpi->gf_frame_index == 0 && total_stats && + cpi->ppi->twopass.stats_buf_ctx->total_left_stats) { + if (cpi->ppi->lap_enabled) { /* * Accumulate total_stats using available limited number of stats, * and assign it to total_left_stats. 
*/ - *cpi->twopass.stats_buf_ctx->total_left_stats = - *cpi->twopass.stats_buf_ctx->total_stats; + *cpi->ppi->twopass.stats_buf_ctx->total_left_stats = *total_stats; } // Special case code for first frame. const int section_target_bandwidth = get_section_target_bandwidth(cpi); @@ -3406,43 +3468,25 @@ static void process_first_pass_stats(AV1_COMP *cpi, rc->active_worst_quality = tmp_q; rc->ni_av_qi = tmp_q; rc->last_q[INTER_FRAME] = tmp_q; - rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params.bit_depth); + rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params->bit_depth); rc->avg_frame_qindex[INTER_FRAME] = tmp_q; rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.rc_cfg.best_allowed_q) / 2; rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME]; } - int err = 0; - if (cpi->lap_enabled) { - err = input_stats_lap(twopass, this_frame); + if (cpi->ppi->lap_enabled) { + input_stats_lap(twopass, this_frame); } else { - err = input_stats(twopass, this_frame); - } - if (err == EOF) return; - - { - const int num_mbs = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE) - ? cpi->initial_mbs - : cm->mi_params.MBs; - // The multiplication by 256 reverses a scaling factor of (>> 8) - // applied when combining MB error values for the frame. - twopass->mb_av_energy = log((this_frame->intra_error / num_mbs) + 1.0); - twopass->frame_avg_haar_energy = - log((this_frame->frame_avg_wavelet_energy / num_mbs) + 1.0); + input_stats(twopass, this_frame); } - - // Set the frame content type flag. 
- if (this_frame->intra_skip_pct >= FC_ANIMATION_THRESH) - twopass->fr_content_type = FC_GRAPHICS_ANIMATION; - else - twopass->fr_content_type = FC_NORMAL; + set_twopass_params_based_on_fp_stats(cpi, this_frame); } static void setup_target_rate(AV1_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; - GF_GROUP *const gf_group = &cpi->gf_group; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; - int target_rate = gf_group->bit_allocation[gf_group->index]; + int target_rate = gf_group->bit_allocation[cpi->gf_frame_index]; if (has_no_stats_stage(cpi)) { av1_rc_set_frame_target(cpi, target_rate, cpi->common.width, @@ -3452,24 +3496,160 @@ static void setup_target_rate(AV1_COMP *cpi) { rc->base_frame_target = target_rate; } +static void mark_flashes(FIRSTPASS_STATS *first_stats, + FIRSTPASS_STATS *last_stats) { + FIRSTPASS_STATS *this_stats = first_stats, *next_stats; + while (this_stats < last_stats - 1) { + next_stats = this_stats + 1; + if (next_stats->pcnt_second_ref > next_stats->pcnt_inter && + next_stats->pcnt_second_ref >= 0.5) { + this_stats->is_flash = 1; + } else { + this_stats->is_flash = 0; + } + this_stats = next_stats; + } + // We always treat the last one as none flash. + if (last_stats - 1 >= first_stats) { + (last_stats - 1)->is_flash = 0; + } +} + +// Estimate the noise variance of each frame from the first pass stats +static void estimate_noise(FIRSTPASS_STATS *first_stats, + FIRSTPASS_STATS *last_stats) { + FIRSTPASS_STATS *this_stats, *next_stats; + double C1, C2, C3, noise; + int count = 0; + for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) { + this_stats->noise_var = 0.0; + // flashes tend to have high correlation of innovations, so ignore them. 
+ if (this_stats->is_flash || (this_stats - 1)->is_flash || + (this_stats - 2)->is_flash) + continue; + + C1 = (this_stats - 1)->intra_error * + (this_stats->intra_error - this_stats->coded_error); + C2 = (this_stats - 2)->intra_error * + ((this_stats - 1)->intra_error - (this_stats - 1)->coded_error); + C3 = (this_stats - 2)->intra_error * + (this_stats->intra_error - this_stats->sr_coded_error); + if (C1 <= 0 || C2 <= 0 || C3 <= 0) continue; + C1 = sqrt(C1); + C2 = sqrt(C2); + C3 = sqrt(C3); + + noise = (this_stats - 1)->intra_error - C1 * C2 / C3; + noise = AOMMAX(noise, 0.01); + this_stats->noise_var = noise; + count++; + } + + // Copy noise from the neighbor if the noise value is not trustworthy + for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) { + if (this_stats->is_flash || (this_stats - 1)->is_flash || + (this_stats - 2)->is_flash) + continue; + if (this_stats->noise_var < 1.0) { + int found = 0; + // TODO(bohanli): consider expanding to two directions at the same time + for (next_stats = this_stats + 1; next_stats < last_stats; next_stats++) { + if (next_stats->is_flash || (next_stats - 1)->is_flash || + (next_stats - 2)->is_flash || next_stats->noise_var < 1.0) + continue; + found = 1; + this_stats->noise_var = next_stats->noise_var; + break; + } + if (found) continue; + for (next_stats = this_stats - 1; next_stats >= first_stats + 2; + next_stats--) { + if (next_stats->is_flash || (next_stats - 1)->is_flash || + (next_stats - 2)->is_flash || next_stats->noise_var < 1.0) + continue; + this_stats->noise_var = next_stats->noise_var; + break; + } + } + } + + // copy the noise if this is a flash + for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) { + if (this_stats->is_flash || (this_stats - 1)->is_flash || + (this_stats - 2)->is_flash) { + int found = 0; + for (next_stats = this_stats + 1; next_stats < last_stats; next_stats++) { + if (next_stats->is_flash || (next_stats - 1)->is_flash || + (next_stats - 
2)->is_flash) + continue; + found = 1; + this_stats->noise_var = next_stats->noise_var; + break; + } + if (found) continue; + for (next_stats = this_stats - 1; next_stats >= first_stats + 2; + next_stats--) { + if (next_stats->is_flash || (next_stats - 1)->is_flash || + (next_stats - 2)->is_flash) + continue; + this_stats->noise_var = next_stats->noise_var; + break; + } + } + } + + // if we are at the first 2 frames, copy the noise + for (this_stats = first_stats; + this_stats < first_stats + 2 && (first_stats + 2) < last_stats; + this_stats++) { + this_stats->noise_var = (first_stats + 2)->noise_var; + } +} + +// Estimate correlation coefficient of each frame with its previous frame. +static void estimate_coeff(FIRSTPASS_STATS *first_stats, + FIRSTPASS_STATS *last_stats) { + FIRSTPASS_STATS *this_stats; + for (this_stats = first_stats + 1; this_stats < last_stats; this_stats++) { + const double C = + sqrt(AOMMAX((this_stats - 1)->intra_error * + (this_stats->intra_error - this_stats->coded_error), + 0.001)); + const double cor_coeff = + C / + AOMMAX((this_stats - 1)->intra_error - this_stats->noise_var, 0.001); + + this_stats->cor_coeff = + cor_coeff * + sqrt(AOMMAX((this_stats - 1)->intra_error - this_stats->noise_var, + 0.001) / + AOMMAX(this_stats->intra_error - this_stats->noise_var, 0.001)); + // clip correlation coefficient. 
+ this_stats->cor_coeff = AOMMIN(AOMMAX(this_stats->cor_coeff, 0), 1); + } + first_stats->cor_coeff = 1.0; +} + void av1_get_second_pass_params(AV1_COMP *cpi, EncodeFrameParams *const frame_params, const EncodeFrameInput *const frame_input, unsigned int frame_flags) { RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; - GF_GROUP *const gf_group = &cpi->gf_group; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + TWO_PASS *const twopass = &cpi->ppi->twopass; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const FIRSTPASS_STATS *const start_pos = twopass->stats_in; if (is_stat_consumption_stage(cpi) && !twopass->stats_in) return; - const int update_type = gf_group->update_type[gf_group->index]; - frame_params->frame_type = gf_group->frame_type[gf_group->index]; + assert(twopass->stats_in != NULL); + const int update_type = gf_group->update_type[cpi->gf_frame_index]; + frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index]; - if (gf_group->index < gf_group->size && !(frame_flags & FRAMEFLAGS_KEY)) { - assert(gf_group->index < gf_group->size); + if (cpi->gf_frame_index < gf_group->size && !(frame_flags & FRAMEFLAGS_KEY)) { + assert(cpi->gf_frame_index < gf_group->size); setup_target_rate(cpi); @@ -3481,6 +3661,9 @@ void av1_get_second_pass_params(AV1_COMP *cpi, if (cpi->sf.part_sf.allow_partition_search_skip && oxcf->pass == 2) { cpi->partition_search_skippable_frame = is_skippable_frame(cpi); } + const FIRSTPASS_STATS *const this_frame_ptr = read_frame_stats( + twopass, gf_group->arf_src_offset[cpi->gf_frame_index]); + set_twopass_params_based_on_fp_stats(cpi, this_frame_ptr); return; } } @@ -3493,7 +3676,7 @@ void av1_get_second_pass_params(AV1_COMP *cpi, av1_zero(this_frame); // call above fn if (is_stat_consumption_stage(cpi)) { - if (gf_group->index < gf_group->size || rc->frames_to_key == 0) + if (cpi->gf_frame_index < gf_group->size || rc->frames_to_key == 0) 
process_first_pass_stats(cpi, &this_frame); } else { rc->active_worst_quality = oxcf->rc_cfg.cq_level; @@ -3504,7 +3687,7 @@ void av1_get_second_pass_params(AV1_COMP *cpi, this_frame_copy = this_frame; int is_overlay_forward_kf = rc->frames_to_key == 0 && - gf_group->update_type[gf_group->index] == OVERLAY_UPDATE; + gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE; if (rc->frames_to_key <= 0 && !is_overlay_forward_kf) { assert(rc->frames_to_key >= -1); // Define next KF group and assign bits to it. @@ -3554,12 +3737,12 @@ void av1_get_second_pass_params(AV1_COMP *cpi, } // Define a new GF/ARF group. (Should always enter here for key frames). - if (gf_group->index == gf_group->size) { + if (cpi->gf_frame_index == gf_group->size) { assert(cpi->common.current_frame.frame_number == 0 || - gf_group->index == gf_group->size); + cpi->gf_frame_index == gf_group->size); const FIRSTPASS_STATS *const start_position = twopass->stats_in; - if (cpi->lap_enabled && cpi->rc.enable_scenecut_detection) { + if (cpi->ppi->lap_enabled && cpi->ppi->p_rc.enable_scenecut_detection) { int num_frames_to_detect_scenecut, frames_to_key; num_frames_to_detect_scenecut = MAX_GF_LENGTH_LAP + 1; frames_to_key = define_kf_interval(cpi, &this_frame, NULL, @@ -3578,41 +3761,45 @@ void av1_get_second_pass_params(AV1_COMP *cpi, : MAX_GF_LENGTH_LAP; // Identify regions if needed. + // TODO(bohanli): identify regions for all stats available. 
if (rc->frames_since_key == 0 || rc->frames_since_key == 1 || - (rc->frames_till_regions_update - rc->frames_since_key < + (p_rc->frames_till_regions_update - rc->frames_since_key < rc->frames_to_key && - rc->frames_till_regions_update - rc->frames_since_key < + p_rc->frames_till_regions_update - rc->frames_since_key < max_gop_length + 1)) { - int is_first_stat = - twopass->stats_in == twopass->stats_buf_ctx->stats_in_start; - const FIRSTPASS_STATS *stats_start = twopass->stats_in + is_first_stat; - // offset of stats_start from the current frame - int offset = is_first_stat || (rc->frames_since_key == 0); - // offset of the region indices from the previous key frame - rc->regions_offset = rc->frames_since_key; // how many frames we can analyze from this frame - int rest_frames = AOMMIN(rc->frames_to_key + rc->next_is_fwd_key, + int rest_frames = AOMMIN(rc->frames_to_key + p_rc->next_is_fwd_key, MAX_FIRSTPASS_ANALYSIS_FRAMES); - rest_frames = - AOMMIN(rest_frames, - (int)(twopass->stats_buf_ctx->stats_in_end - stats_start + 1) + - offset); - - rc->frames_till_regions_update = rest_frames; - - identify_regions(stats_start, rest_frames - offset, offset, rc->regions, - &rc->num_regions, rc->cor_coeff); + rest_frames = AOMMIN( + rest_frames, (int)(twopass->stats_buf_ctx->stats_in_end - + twopass->stats_in + (rc->frames_since_key == 0))); + p_rc->frames_till_regions_update = rest_frames; + + if (cpi->ppi->lap_enabled) { + mark_flashes(twopass->stats_buf_ctx->stats_in_start, + twopass->stats_buf_ctx->stats_in_end); + estimate_noise(twopass->stats_buf_ctx->stats_in_start, + twopass->stats_buf_ctx->stats_in_end); + estimate_coeff(twopass->stats_buf_ctx->stats_in_start, + twopass->stats_buf_ctx->stats_in_end); + identify_regions(twopass->stats_in, rest_frames, + (rc->frames_since_key == 0), p_rc->regions, + &p_rc->num_regions); + } else { + identify_regions(twopass->stats_in - (rc->frames_since_key == 0), + rest_frames, 0, p_rc->regions, &p_rc->num_regions); + } } int 
cur_region_idx = - find_regions_index(rc->regions, rc->num_regions, - rc->frames_since_key - rc->regions_offset); + find_regions_index(p_rc->regions, p_rc->num_regions, + rc->frames_since_key - p_rc->regions_offset); if ((cur_region_idx >= 0 && - rc->regions[cur_region_idx].type == SCENECUT_REGION) || + p_rc->regions[cur_region_idx].type == SCENECUT_REGION) || rc->frames_since_key == 0) { // If we start from a scenecut, then the last GOP's arf boost is not // needed for this GOP. - cpi->gf_state.arf_gf_boost_lst = 0; + cpi->ppi->gf_state.arf_gf_boost_lst = 0; } // TODO(jingning): Resoleve the redundant calls here. @@ -3621,62 +3808,49 @@ void av1_get_second_pass_params(AV1_COMP *cpi, } if (max_gop_length > 16 && oxcf->algo_cfg.enable_tpl_model && - !cpi->sf.tpl_sf.disable_gop_length_decision) { - int this_idx = rc->frames_since_key + rc->gf_intervals[rc->cur_gf_index] - - rc->regions_offset - 1; + cpi->sf.tpl_sf.gop_length_decision_method != 3) { + int this_idx = rc->frames_since_key + + p_rc->gf_intervals[p_rc->cur_gf_index] - + p_rc->regions_offset - 1; int this_region = - find_regions_index(rc->regions, rc->num_regions, this_idx); + find_regions_index(p_rc->regions, p_rc->num_regions, this_idx); int next_region = - find_regions_index(rc->regions, rc->num_regions, this_idx + 1); + find_regions_index(p_rc->regions, p_rc->num_regions, this_idx + 1); int is_last_scenecut = - (rc->gf_intervals[rc->cur_gf_index] >= rc->frames_to_key || - rc->regions[this_region].type == SCENECUT_REGION || - rc->regions[next_region].type == SCENECUT_REGION); - int ori_gf_int = rc->gf_intervals[rc->cur_gf_index]; + (p_rc->gf_intervals[p_rc->cur_gf_index] >= rc->frames_to_key || + p_rc->regions[this_region].type == SCENECUT_REGION || + p_rc->regions[next_region].type == SCENECUT_REGION); + int ori_gf_int = p_rc->gf_intervals[p_rc->cur_gf_index]; - if (rc->gf_intervals[rc->cur_gf_index] > 16) { + if (p_rc->gf_intervals[p_rc->cur_gf_index] > 16 && + rc->min_gf_interval <= 16) { // The 
calculate_gf_length function is previously used with // max_gop_length = 32 with look-ahead gf intervals. define_gf_group(cpi, &this_frame, frame_params, max_gop_length, 0); this_frame = this_frame_copy; - int is_temporal_filter_enabled = - (rc->frames_since_key > 0 && gf_group->arf_index > -1); - if (is_temporal_filter_enabled) { - int arf_src_index = gf_group->arf_src_offset[gf_group->arf_index]; - FRAME_UPDATE_TYPE arf_update_type = - gf_group->update_type[gf_group->arf_index]; - int is_forward_keyframe = 0; - av1_temporal_filter(cpi, arf_src_index, arf_update_type, - is_forward_keyframe, NULL); - aom_extend_frame_borders(&cpi->alt_ref_buffer, - av1_num_planes(&cpi->common)); - } - if (!av1_tpl_setup_stats(cpi, 1, frame_params, frame_input)) { - // Tpl decides that a shorter gf interval is better. + + if (is_shorter_gf_interval_better(cpi, frame_params, frame_input)) { + // A shorter gf interval is better. // TODO(jingning): Remove redundant computations here. max_gop_length = 16; calculate_gf_length(cpi, max_gop_length, 1); if (is_last_scenecut && - (ori_gf_int - rc->gf_intervals[rc->cur_gf_index] < 4)) { - rc->gf_intervals[rc->cur_gf_index] = ori_gf_int; + (ori_gf_int - p_rc->gf_intervals[p_rc->cur_gf_index] < 4)) { + p_rc->gf_intervals[p_rc->cur_gf_index] = ori_gf_int; } - } else { - // Tpl stats is reused only when the ARF frame is temporally filtered - if (is_temporal_filter_enabled) - cpi->tpl_data.skip_tpl_setup_stats = 1; } } } define_gf_group(cpi, &this_frame, frame_params, max_gop_length, 0); - if (gf_group->update_type[gf_group->index] != ARF_UPDATE && + if (gf_group->update_type[cpi->gf_frame_index] != ARF_UPDATE && rc->frames_since_key > 0) process_first_pass_stats(cpi, &this_frame); define_gf_group(cpi, &this_frame, frame_params, max_gop_length, 1); - rc->frames_till_gf_update_due = rc->baseline_gf_interval; - assert(gf_group->index == 0); + rc->frames_till_gf_update_due = p_rc->baseline_gf_interval; + assert(cpi->gf_frame_index == 0); #if 
ARF_STATS_OUTPUT { FILE *fpfile; @@ -3684,18 +3858,22 @@ void av1_get_second_pass_params(AV1_COMP *cpi, ++arf_count; fprintf(fpfile, "%10d %10d %10d %10d %10d\n", cpi->common.current_frame.frame_number, - rc->frames_till_gf_update_due, rc->kf_boost, arf_count, - rc->gfu_boost); + rc->frames_till_gf_update_due, cpi->ppi->p_rc.kf_boost, arf_count, + p_rc->gfu_boost); fclose(fpfile); } #endif } - assert(gf_group->index < gf_group->size); + assert(cpi->gf_frame_index < gf_group->size); - if (gf_group->update_type[gf_group->index] == ARF_UPDATE || - gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) { + if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE || + gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) { reset_fpf_position(twopass, start_pos); + + const FIRSTPASS_STATS *const this_frame_ptr = read_frame_stats( + twopass, gf_group->arf_src_offset[cpi->gf_frame_index]); + set_twopass_params_based_on_fp_stats(cpi, this_frame_ptr); } else { // Update the total stats remaining structure. if (twopass->stats_buf_ctx->total_left_stats) @@ -3703,7 +3881,7 @@ void av1_get_second_pass_params(AV1_COMP *cpi, &this_frame_copy); } - frame_params->frame_type = gf_group->frame_type[gf_group->index]; + frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index]; // Do the firstpass stats indicate that this frame is skippable for the // partition search? 
@@ -3716,13 +3894,20 @@ void av1_get_second_pass_params(AV1_COMP *cpi, void av1_init_second_pass(AV1_COMP *cpi) { const AV1EncoderConfig *const oxcf = &cpi->oxcf; - TWO_PASS *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->ppi->twopass; FRAME_INFO *const frame_info = &cpi->frame_info; double frame_rate; FIRSTPASS_STATS *stats; if (!twopass->stats_buf_ctx->stats_in_end) return; + mark_flashes(twopass->stats_buf_ctx->stats_in_start, + twopass->stats_buf_ctx->stats_in_end); + estimate_noise(twopass->stats_buf_ctx->stats_in_start, + twopass->stats_buf_ctx->stats_in_end); + estimate_coeff(twopass->stats_buf_ctx->stats_in_start, + twopass->stats_buf_ctx->stats_in_end); + stats = twopass->stats_buf_ctx->total_stats; *stats = *twopass->stats_buf_ctx->stats_in_end; @@ -3779,7 +3964,7 @@ void av1_init_second_pass(AV1_COMP *cpi) { } void av1_init_single_pass_lap(AV1_COMP *cpi) { - TWO_PASS *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->ppi->twopass; if (!twopass->stats_buf_ctx->stats_in_end) return; @@ -3813,7 +3998,7 @@ void av1_init_single_pass_lap(AV1_COMP *cpi) { #define MINQ_ADJ_LIMIT_CQ 20 #define HIGH_UNDERSHOOT_RATIO 2 void av1_twopass_postencode_update(AV1_COMP *cpi) { - TWO_PASS *const twopass = &cpi->twopass; + TWO_PASS *const twopass = &cpi->ppi->twopass; RATE_CONTROL *const rc = &cpi->rc; const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg; @@ -3840,7 +4025,8 @@ void av1_twopass_postencode_update(AV1_COMP *cpi) { // Update the active best quality pyramid. 
if (!rc->is_src_frame_alt_ref) { - const int pyramid_level = cpi->gf_group.layer_depth[cpi->gf_group.index]; + const int pyramid_level = + cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index]; int i; for (i = pyramid_level; i <= MAX_ARF_LAYERS; ++i) { rc->active_best_quality[i] = cpi->common.quant_params.base_qindex; @@ -3871,9 +4057,9 @@ void av1_twopass_postencode_update(AV1_COMP *cpi) { (double)twopass->rolling_arf_group_target_bits, twopass->bpm_factor, av1_convert_qindex_to_q(cpi->common.quant_params.base_qindex, - cm->seq_params.bit_depth), + cm->seq_params->bit_depth), av1_convert_qindex_to_q(rc->active_worst_quality, - cm->seq_params.bit_depth)); + cm->seq_params->bit_depth)); fclose(fpfile); } #endif diff --git a/third_party/libaom/source/libaom/av1/encoder/pickcdef.c b/third_party/libaom/source/libaom/av1/encoder/pickcdef.c index 55e466d601..f9758343dc 100644 --- a/third_party/libaom/source/libaom/av1/encoder/pickcdef.c +++ b/third_party/libaom/source/libaom/av1/encoder/pickcdef.c @@ -454,13 +454,13 @@ static AOM_INLINE void cdef_params_init(const YV12_BUFFER_CONFIG *frame, (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; cdef_search_ctx->nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; - cdef_search_ctx->coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0); + cdef_search_ctx->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0); cdef_search_ctx->damping = 3 + (cm->quant_params.base_qindex >> 6); cdef_search_ctx->total_strengths = nb_cdef_strengths[pick_method]; cdef_search_ctx->num_planes = num_planes; cdef_search_ctx->pick_method = pick_method; cdef_search_ctx->sb_count = 0; - av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0, + av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0, num_planes); // Initialize plane wise information. 
for (int pli = 0; pli < num_planes; pli++) { @@ -478,7 +478,7 @@ static AOM_INLINE void cdef_params_init(const YV12_BUFFER_CONFIG *frame, } // Function pointer initialization. #if CONFIG_AV1_HIGHBITDEPTH - if (cm->seq_params.use_highbitdepth) { + if (cm->seq_params->use_highbitdepth) { cdef_search_ctx->copy_fn = copy_sb16_16_highbd; cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist_highbd; } else { @@ -491,13 +491,20 @@ static AOM_INLINE void cdef_params_init(const YV12_BUFFER_CONFIG *frame, #endif } -static void pick_cdef_from_qp(AV1_COMMON *const cm) { - const int bd = cm->seq_params.bit_depth; +static void pick_cdef_from_qp(AV1_COMMON *const cm, int skip_cdef, + int frames_since_key) { + const int bd = cm->seq_params->bit_depth; const int q = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, bd) >> (bd - 8); CdefInfo *const cdef_info = &cm->cdef_info; - cdef_info->cdef_bits = 0; - cdef_info->nb_cdef_strengths = 1; + // Check the speed feature to avoid extra signaling. + if (skip_cdef) { + cdef_info->cdef_bits = 1; + cdef_info->nb_cdef_strengths = 2; + } else { + cdef_info->cdef_bits = 0; + cdef_info->nb_cdef_strengths = 1; + } cdef_info->cdef_damping = 3 + (cm->quant_params.base_qindex >> 6); int predicted_y_f1 = 0; @@ -537,13 +544,22 @@ static void pick_cdef_from_qp(AV1_COMMON *const cm) { cdef_info->cdef_uv_strengths[0] = predicted_uv_f1 * CDEF_SEC_STRENGTHS + predicted_uv_f2; + if (skip_cdef) { + cdef_info->cdef_strengths[1] = 0; + cdef_info->cdef_uv_strengths[1] = 0; + } const CommonModeInfoParams *const mi_params = &cm->mi_params; const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; MB_MODE_INFO **mbmi = mi_params->mi_grid_base; for (int r = 0; r < nvfb; ++r) { for (int c = 0; c < nhfb; ++c) { - mbmi[MI_SIZE_64X64 * c]->cdef_strength = 0; + MB_MODE_INFO *current_mbmi = mbmi[MI_SIZE_64X64 * c]; + current_mbmi->cdef_strength = 0; + if (skip_cdef && 
current_mbmi->skip_cdef_curr_sb && + frames_since_key > 10) { + current_mbmi->cdef_strength = 1; + } } mbmi += MI_SIZE_64X64 * mi_params->mi_stride; } @@ -551,10 +567,10 @@ static void pick_cdef_from_qp(AV1_COMMON *const cm) { void av1_cdef_search(MultiThreadInfo *mt_info, const YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, AV1_COMMON *cm, - MACROBLOCKD *xd, CDEF_PICK_METHOD pick_method, - int rdmult) { + MACROBLOCKD *xd, CDEF_PICK_METHOD pick_method, int rdmult, + int skip_cdef_feature, int frames_since_key) { if (pick_method == CDEF_PICK_FROM_Q) { - pick_cdef_from_qp(cm); + pick_cdef_from_qp(cm, skip_cdef_feature, frames_since_key); return; } const CommonModeInfoParams *const mi_params = &cm->mi_params; diff --git a/third_party/libaom/source/libaom/av1/encoder/pickcdef.h b/third_party/libaom/source/libaom/av1/encoder/pickcdef.h index 7fe1edb695..6bea1b0945 100644 --- a/third_party/libaom/source/libaom/av1/encoder/pickcdef.h +++ b/third_party/libaom/source/libaom/av1/encoder/pickcdef.h @@ -58,20 +58,6 @@ typedef uint64_t (*compute_cdef_dist_t)(void *dst, int dstride, uint16_t *src, BLOCK_SIZE bsize, int coeff_shift, int row, int col); -// Data related to CDEF search multi-thread synchronization. -typedef struct AV1CdefSyncData { -#if CONFIG_MULTITHREAD - // Mutex lock used while dispatching jobs. - pthread_mutex_t *mutex_; -#endif // CONFIG_MULTITHREAD - // Flag to indicate all blocks are processed and end of frame is reached - int end_of_frame; - // Row index in units of 64x64 block - int fbr; - // Column index in units of 64x64 block - int fbc; -} AV1CdefSync; - /*! \brief CDEF search context. 
*/ typedef struct { @@ -224,6 +210,8 @@ void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, int fbr, int fbc, * \param[in] xd Pointer to common current coding block structure * \param[in] pick_method The method used to select params * \param[in] rdmult rd multiplier to use in making param choices + * \param[in] skip_cdef_feature Speed feature to skip cdef + * \param[in] frames_since_key Number of frames since key frame * * \return Nothing is returned. Instead, optimal CDEF parameters are stored * in the \c cdef_info structure of type \ref CdefInfo inside \c cm: @@ -239,7 +227,8 @@ void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, int fbr, int fbc, void av1_cdef_search(struct MultiThreadInfo *mt_info, const YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, AV1_COMMON *cm, - MACROBLOCKD *xd, CDEF_PICK_METHOD pick_method, int rdmult); + MACROBLOCKD *xd, CDEF_PICK_METHOD pick_method, int rdmult, + int skip_cdef_feature, int frames_since_key); #ifdef __cplusplus } // extern "C" diff --git a/third_party/libaom/source/libaom/av1/encoder/picklpf.c b/third_party/libaom/source/libaom/av1/encoder/picklpf.c index 9b3924f5ce..44030767b5 100644 --- a/third_party/libaom/source/libaom/av1/encoder/picklpf.c +++ b/third_party/libaom/source/libaom/av1/encoder/picklpf.c @@ -39,8 +39,8 @@ static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc, int av1_get_max_filter_level(const AV1_COMP *cpi) { if (is_stat_consumption_stage_twopass(cpi)) { - return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4 - : MAX_LOOP_FILTER; + return cpi->ppi->twopass.section_intra_rating > 8 ? 
MAX_LOOP_FILTER * 3 / 4 + : MAX_LOOP_FILTER; } else { return MAX_LOOP_FILTER; } @@ -78,16 +78,16 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd, 0, #endif mt_info->workers, num_workers, - &mt_info->lf_row_sync); + &mt_info->lf_row_sync, 0); else av1_loop_filter_frame(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd, #if CONFIG_LPF_MASK 0, #endif - plane, plane + 1, partial_frame); + plane, plane + 1, partial_frame, 0); filt_err = aom_get_sse_plane(sd, &cm->cur_frame->buf, plane, - cm->seq_params.use_highbitdepth); + cm->seq_params->use_highbitdepth); // Re-instate the unfiltered frame yv12_copy_plane(&cpi->last_frame_uf, &cm->cur_frame->buf, plane); @@ -153,8 +153,8 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; if ((is_stat_consumption_stage_twopass(cpi)) && - (cpi->twopass.section_intra_rating < 20)) - bias = (bias * cpi->twopass.section_intra_rating) / 20; + (cpi->ppi->twopass.section_intra_rating < 20)) + bias = (bias * cpi->ppi->twopass.section_intra_rating) / 20; // yx, bias less for large block size if (cm->features.tx_mode != ONLY_4X4) bias >>= 1; @@ -205,7 +205,7 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, if (best_cost_ret) *best_cost_ret = RDCOST_DBL_WITH_NATIVE_BD_DIST( - x->rdmult, 0, (best_err << 4), cm->seq_params.bit_depth); + x->rdmult, 0, (best_err << 4), cm->seq_params->bit_depth); return filt_best; } @@ -226,7 +226,7 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, const int min_filter_level = 0; const int max_filter_level = av1_get_max_filter_level(cpi); const int q = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); // based on tests result for rtc test set // 0.04590 boosted or 0.02295 non-booseted in 18-bit fixed point const int strength_boost_q_treshold = 0; @@ -244,7 +244,7 @@ void av1_pick_filter_level(const 
YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, // And high bit depth separately: // filt_guess = q * 0.316206 + 3.87252 int filt_guess; - switch (cm->seq_params.bit_depth) { + switch (cm->seq_params->bit_depth) { case AOM_BITS_8: filt_guess = (cm->current_frame.frame_type == KEY_FRAME) @@ -263,7 +263,7 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, "or AOM_BITS_12"); return; } - if (cm->seq_params.bit_depth != AOM_BITS_8 && + if (cm->seq_params->bit_depth != AOM_BITS_8 && cm->current_frame.frame_type == KEY_FRAME) filt_guess -= 4; // TODO(chengchen): retrain the model for Y, U, V filter levels @@ -272,10 +272,20 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, lf->filter_level_u = clamp(filt_guess, min_filter_level, max_filter_level); lf->filter_level_v = clamp(filt_guess, min_filter_level, max_filter_level); } else { - const int last_frame_filter_level[4] = { lf->filter_level[0], - lf->filter_level[1], - lf->filter_level_u, - lf->filter_level_v }; + int last_frame_filter_level[4] = { 0 }; + if (!frame_is_intra_only(cm)) { +#if CONFIG_FRAME_PARALLEL_ENCODE + last_frame_filter_level[0] = cpi->ppi->filter_level[0]; + last_frame_filter_level[1] = cpi->ppi->filter_level[1]; + last_frame_filter_level[2] = cpi->ppi->filter_level_u; + last_frame_filter_level[3] = cpi->ppi->filter_level_v; +#else + last_frame_filter_level[0] = lf->filter_level[0]; + last_frame_filter_level[1] = lf->filter_level[1]; + last_frame_filter_level[2] = lf->filter_level_u; + last_frame_filter_level[3] = lf->filter_level_v; +#endif + } lf->filter_level[0] = lf->filter_level[1] = search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, @@ -297,5 +307,14 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, last_frame_filter_level, NULL, 2, 0); } +#if CONFIG_FRAME_PARALLEL_ENCODE + // Store current frame loopfilter levels if update flag is set. 
+ if (cpi->do_frame_data_update) { + cpi->ppi->filter_level[0] = lf->filter_level[0]; + cpi->ppi->filter_level[1] = lf->filter_level[1]; + cpi->ppi->filter_level_u = lf->filter_level_u; + cpi->ppi->filter_level_v = lf->filter_level_v; + } +#endif } } diff --git a/third_party/libaom/source/libaom/av1/encoder/pickrst.c b/third_party/libaom/source/libaom/av1/encoder/pickrst.c index 21965138be..2c12cb014f 100644 --- a/third_party/libaom/source/libaom/av1/encoder/pickrst.c +++ b/third_party/libaom/source/libaom/av1/encoder/pickrst.c @@ -199,8 +199,8 @@ static int64_t try_restoration_unit(const RestSearchCtxt *rsc, const int is_uv = plane > 0; const RestorationInfo *rsi = &cm->rst_info[plane]; RestorationLineBuffers rlbs; - const int bit_depth = cm->seq_params.bit_depth; - const int highbd = cm->seq_params.use_highbitdepth; + const int bit_depth = cm->seq_params->bit_depth; + const int highbd = cm->seq_params->use_highbitdepth; const YV12_BUFFER_CONFIG *fts = &cm->cur_frame->buf; // TODO(yunqing): For now, only use optimized LR filter in decoder. 
Can be @@ -209,8 +209,8 @@ static int64_t try_restoration_unit(const RestSearchCtxt *rsc, av1_loop_restoration_filter_unit( limits, rui, &rsi->boundaries, &rlbs, tile_rect, rsc->tile_stripe0, - is_uv && cm->seq_params.subsampling_x, - is_uv && cm->seq_params.subsampling_y, highbd, bit_depth, + is_uv && cm->seq_params->subsampling_x, + is_uv && cm->seq_params->subsampling_y, highbd, bit_depth, fts->buffers[plane], fts->strides[is_uv], rsc->dst->buffers[plane], rsc->dst->strides[is_uv], cm->rst_tmpbuf, optimized_lr); @@ -886,8 +886,8 @@ static AOM_INLINE void search_sgrproj(const RestorationTileLimits *limits, const MACROBLOCK *const x = rsc->x; const AV1_COMMON *const cm = rsc->cm; - const int highbd = cm->seq_params.use_highbitdepth; - const int bit_depth = cm->seq_params.bit_depth; + const int highbd = cm->seq_params->use_highbitdepth; + const int bit_depth = cm->seq_params->bit_depth; const int64_t bits_none = x->mode_costs.sgrproj_restore_cost[0]; // Prune evaluation of RESTORE_SGRPROJ if 'skip_sgr_eval' is set @@ -905,8 +905,8 @@ static AOM_INLINE void search_sgrproj(const RestorationTileLimits *limits, rsc->src_buffer + limits->v_start * rsc->src_stride + limits->h_start; const int is_uv = rsc->plane > 0; - const int ss_x = is_uv && cm->seq_params.subsampling_x; - const int ss_y = is_uv && cm->seq_params.subsampling_y; + const int ss_x = is_uv && cm->seq_params->subsampling_x; + const int ss_y = is_uv && cm->seq_params->subsampling_y; const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x; const int procunit_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; @@ -1474,12 +1474,12 @@ static AOM_INLINE void search_wiener(const RestorationTileLimits *limits, const int scale[3] = { 0, 1, 2 }; // Obtain the normalized Qscale const int qs = av1_dc_quant_QTX(rsc->cm->quant_params.base_qindex, 0, - rsc->cm->seq_params.bit_depth) >> + rsc->cm->seq_params->bit_depth) >> 3; // Derive threshold as sqr(normalized Qscale) * scale / 16, const uint64_t thresh = (qs * qs * 
scale[rsc->lpf_sf->prune_wiener_based_on_src_var]) >> 4; - const int highbd = rsc->cm->seq_params.use_highbitdepth; + const int highbd = rsc->cm->seq_params->use_highbitdepth; const uint64_t src_var = var_restoration_unit(limits, rsc->src, rsc->plane, highbd); // Do not perform Wiener search if source variance is lower than threshold @@ -1510,11 +1510,11 @@ static AOM_INLINE void search_wiener(const RestorationTileLimits *limits, #if CONFIG_AV1_HIGHBITDEPTH const AV1_COMMON *const cm = rsc->cm; - if (cm->seq_params.use_highbitdepth) { + if (cm->seq_params->use_highbitdepth) { av1_compute_stats_highbd(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer, limits->h_start, limits->h_end, limits->v_start, limits->v_end, rsc->dgd_stride, - rsc->src_stride, M, H, cm->seq_params.bit_depth); + rsc->src_stride, M, H, cm->seq_params->bit_depth); } else { av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer, limits->h_start, limits->h_end, limits->v_start, @@ -1567,10 +1567,10 @@ static AOM_INLINE void search_wiener(const RestorationTileLimits *limits, double cost_none = RDCOST_DBL_WITH_NATIVE_BD_DIST( x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE], - rsc->cm->seq_params.bit_depth); + rsc->cm->seq_params->bit_depth); double cost_wiener = RDCOST_DBL_WITH_NATIVE_BD_DIST( x->rdmult, bits_wiener >> 4, rusi->sse[RESTORE_WIENER], - rsc->cm->seq_params.bit_depth); + rsc->cm->seq_params->bit_depth); RestorationType rtype = (cost_wiener < cost_none) ? 
RESTORE_WIENER : RESTORE_NONE; @@ -1601,7 +1601,7 @@ static AOM_INLINE void search_norestore(const RestorationTileLimits *limits, RestSearchCtxt *rsc = (RestSearchCtxt *)priv; RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; - const int highbd = rsc->cm->seq_params.use_highbitdepth; + const int highbd = rsc->cm->seq_params->use_highbitdepth; rusi->sse[RESTORE_NONE] = sse_restoration_unit( limits, rsc->src, &rsc->cm->cur_frame->buf, rsc->plane, highbd); @@ -1653,8 +1653,8 @@ static AOM_INLINE void search_switchable(const RestorationTileLimits *limits, } const int64_t coeff_bits = coeff_pcost << AV1_PROB_COST_SHIFT; const int64_t bits = x->mode_costs.switchable_restore_cost[r] + coeff_bits; - double cost = RDCOST_DBL_WITH_NATIVE_BD_DIST(x->rdmult, bits >> 4, sse, - rsc->cm->seq_params.bit_depth); + double cost = RDCOST_DBL_WITH_NATIVE_BD_DIST( + x->rdmult, bits >> 4, sse, rsc->cm->seq_params->bit_depth); if (r == RESTORE_SGRPROJ && rusi->sgrproj.ep < 10) cost *= (1 + DUAL_SGR_PENALTY_MULT * rsc->lpf_sf->dual_sgr_penalty_level); if (r == 0 || cost < best_cost) { @@ -1694,7 +1694,7 @@ static double search_rest_type(RestSearchCtxt *rsc, RestorationType rtype) { av1_foreach_rest_unit_in_plane(rsc->cm, rsc->plane, funs[rtype], rsc, &rsc->tile_rect, rsc->cm->rst_tmpbuf, NULL); return RDCOST_DBL_WITH_NATIVE_BD_DIST( - rsc->x->rdmult, rsc->bits >> 4, rsc->sse, rsc->cm->seq_params.bit_depth); + rsc->x->rdmult, rsc->bits >> 4, rsc->sse, rsc->cm->seq_params->bit_depth); } static int rest_tiles_in_plane(const AV1_COMMON *cm, int plane) { @@ -1740,7 +1740,7 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) { double best_cost = 0; RestorationType best_rtype = RESTORE_NONE; - const int highbd = rsc.cm->seq_params.use_highbitdepth; + const int highbd = rsc.cm->seq_params->use_highbitdepth; if (!cpi->sf.lpf_sf.disable_loop_restoration_chroma || !plane) { av1_extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height, rsc.dgd_stride, 
RESTORATION_BORDER, RESTORATION_BORDER, diff --git a/third_party/libaom/source/libaom/av1/encoder/ratectrl.c b/third_party/libaom/source/libaom/av1/encoder/ratectrl.c index 33befa6147..c24c822b9b 100644 --- a/third_party/libaom/source/libaom/av1/encoder/ratectrl.c +++ b/third_party/libaom/source/libaom/av1/encoder/ratectrl.c @@ -233,11 +233,12 @@ static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) { LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers); LAYER_CONTEXT *lc = &svc->layer_context[layer]; RATE_CONTROL *lrc = &lc->rc; + PRIMARY_RATE_CONTROL *lp_rc = &lc->p_rc; lrc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate) - encoded_frame_size; // Clip buffer level to maximum buffer size for the layer. lrc->bits_off_target = - AOMMIN(lrc->bits_off_target, lrc->maximum_buffer_size); + AOMMIN(lrc->bits_off_target, lp_rc->maximum_buffer_size); lrc->buffer_level = lrc->bits_off_target; } } @@ -245,6 +246,7 @@ static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) { static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) { const AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; // Non-viewable frames are a special case and are treated as pure overhead. if (!cm->show_frame) @@ -253,10 +255,11 @@ static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) { rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size; // Clip the buffer level to the maximum specified buffer size. 
- rc->bits_off_target = AOMMIN(rc->bits_off_target, rc->maximum_buffer_size); + rc->bits_off_target = AOMMIN(rc->bits_off_target, p_rc->maximum_buffer_size); rc->buffer_level = rc->bits_off_target; - if (cpi->use_svc) update_layer_buffer_level(&cpi->svc, encoded_frame_size); + if (cpi->ppi->use_svc) + update_layer_buffer_level(&cpi->svc, encoded_frame_size); } int av1_rc_get_default_min_gf_interval(int width, int height, @@ -285,7 +288,24 @@ int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) { return AOMMAX(interval, min_gf_interval); } -void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { +void av1_primary_rc_init(const AV1EncoderConfig *oxcf, + PRIMARY_RATE_CONTROL *p_rc) { + int min_gf_interval = oxcf->gf_cfg.min_gf_interval; + int max_gf_interval = oxcf->gf_cfg.max_gf_interval; + if (min_gf_interval == 0) + min_gf_interval = av1_rc_get_default_min_gf_interval( + oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, + oxcf->input_cfg.init_framerate); + if (max_gf_interval == 0) + max_gf_interval = av1_rc_get_default_max_gf_interval( + oxcf->input_cfg.init_framerate, min_gf_interval); + p_rc->baseline_gf_interval = (min_gf_interval + max_gf_interval) / 2; + p_rc->this_key_frame_forced = 0; + p_rc->next_key_frame_forced = 0; +} + +void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc, + const PRIMARY_RATE_CONTROL *const p_rc) { const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; int i; @@ -302,8 +322,8 @@ void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { rc->last_q[KEY_FRAME] = rc_cfg->best_allowed_q; rc->last_q[INTER_FRAME] = rc_cfg->worst_allowed_q; - rc->buffer_level = rc->starting_buffer_level; - rc->bits_off_target = rc->starting_buffer_level; + rc->buffer_level = p_rc->starting_buffer_level; + rc->bits_off_target = p_rc->starting_buffer_level; rc->rolling_target_bits = rc->avg_frame_bandwidth; rc->rolling_actual_bits = rc->avg_frame_bandwidth; @@ -312,8 +332,6 @@ 
void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { rc->total_target_bits = 0; rc->frames_since_key = 8; // Sensible default for first frame. - rc->this_key_frame_forced = 0; - rc->next_key_frame_forced = 0; rc->frames_till_gf_update_due = 0; rc->ni_av_qi = rc_cfg->worst_allowed_q; @@ -337,7 +355,6 @@ void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { if (rc->max_gf_interval == 0) rc->max_gf_interval = av1_rc_get_default_max_gf_interval( oxcf->input_cfg.init_framerate, rc->min_gf_interval); - rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2; rc->avg_frame_low_motion = 0; rc->resize_state = ORIG; @@ -349,6 +366,7 @@ void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { int av1_rc_drop_frame(AV1_COMP *cpi) { const AV1EncoderConfig *oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; if (!oxcf->rc_cfg.drop_frames_water_mark) { return 0; @@ -360,7 +378,7 @@ int av1_rc_drop_frame(AV1_COMP *cpi) { // If buffer is below drop_mark, for now just drop every other frame // (starting with the next frame) until it increases back over drop_mark. 
int drop_mark = (int)(oxcf->rc_cfg.drop_frames_water_mark * - rc->optimal_buffer_level / 100); + p_rc->optimal_buffer_level / 100); if ((rc->buffer_level > drop_mark) && (rc->decimation_factor > 0)) { --rc->decimation_factor; } else if (rc->buffer_level <= drop_mark && rc->decimation_factor == 0) { @@ -384,6 +402,7 @@ int av1_rc_drop_frame(AV1_COMP *cpi) { static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality) { const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const AV1_COMMON *const cm = &cpi->common; const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame; const int max_delta = 16; @@ -397,7 +416,7 @@ static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality) { (cm->width != cm->prev_frame->width || cm->height != cm->prev_frame->height || change_avg_frame_bandwidth); // Apply some control/clamp to QP under certain conditions. - if (cm->current_frame.frame_type != KEY_FRAME && !cpi->use_svc && + if (cm->current_frame.frame_type != KEY_FRAME && !cpi->ppi->use_svc && rc->frames_since_key > 1 && !change_target_bits_mb && (!cpi->oxcf.rc_cfg.gf_cbr_boost_pct || !(refresh_frame_flags->alt_ref_frame || @@ -411,7 +430,7 @@ static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality) { // Adjust Q base on source content change from scene detection. if (cpi->sf.rt_sf.check_scene_detection && rc->prev_avg_source_sad > 0 && rc->frames_since_key > 10) { - const int bit_depth = cm->seq_params.bit_depth; + const int bit_depth = cm->seq_params->bit_depth; double delta = (double)rc->avg_source_sad / (double)rc->prev_avg_source_sad - 1.0; // Push Q downwards if content change is decreasing and buffer level @@ -419,14 +438,14 @@ static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality) { // only for high Q to avoid excess overshoot. 
// Else reduce decrease in Q from previous frame if content change is // increasing and buffer is below max (so not undershooting). - if (delta < 0.0 && rc->buffer_level > (rc->optimal_buffer_level >> 2) && + if (delta < 0.0 && rc->buffer_level > (p_rc->optimal_buffer_level >> 2) && q > (rc->worst_quality >> 1)) { double q_adj_factor = 1.0 + 0.5 * tanh(4.0 * delta); double q_val = av1_convert_qindex_to_q(q, bit_depth); q += av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth); } else if (rc->q_1_frame - q > 0 && delta > 0.1 && - rc->buffer_level < AOMMIN(rc->maximum_buffer_size, - rc->optimal_buffer_level << 1)) { + rc->buffer_level < AOMMIN(p_rc->maximum_buffer_size, + p_rc->optimal_buffer_level << 1)) { q = (3 * q + rc->q_1_frame) >> 2; } } @@ -452,8 +471,9 @@ static const RATE_FACTOR_LEVEL rate_factor_levels[FRAME_UPDATE_TYPES] = { GF_ARF_LOW, // INTNL_ARF_UPDATE }; -static RATE_FACTOR_LEVEL get_rate_factor_level(const GF_GROUP *const gf_group) { - const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; +static RATE_FACTOR_LEVEL get_rate_factor_level(const GF_GROUP *const gf_group, + int gf_frame_index) { + const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_frame_index]; assert(update_type < FRAME_UPDATE_TYPES); return rate_factor_levels[update_type]; } @@ -480,12 +500,13 @@ static double get_rate_correction_factor(const AV1_COMP *cpi, int width, if (cpi->common.current_frame.frame_type == KEY_FRAME) { rcf = rc->rate_correction_factors[KF_STD]; } else if (is_stat_consumption_stage(cpi)) { - const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(&cpi->gf_group); + const RATE_FACTOR_LEVEL rf_lvl = + get_rate_factor_level(&cpi->ppi->gf_group, cpi->gf_frame_index); rcf = rc->rate_correction_factors[rf_lvl]; } else { if ((refresh_frame_flags->alt_ref_frame || refresh_frame_flags->golden_frame) && - !rc->is_src_frame_alt_ref && !cpi->use_svc && + !rc->is_src_frame_alt_ref && !cpi->ppi->use_svc && (cpi->oxcf.rc_cfg.mode != 
AOM_CBR || cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20)) rcf = rc->rate_correction_factors[GF_ARF_STD]; @@ -524,12 +545,13 @@ static void set_rate_correction_factor(AV1_COMP *cpi, double factor, int width, if (cpi->common.current_frame.frame_type == KEY_FRAME) { rc->rate_correction_factors[KF_STD] = factor; } else if (is_stat_consumption_stage(cpi)) { - const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(&cpi->gf_group); + const RATE_FACTOR_LEVEL rf_lvl = + get_rate_factor_level(&cpi->ppi->gf_group, cpi->gf_frame_index); rc->rate_correction_factors[rf_lvl] = factor; } else { if ((refresh_frame_flags->alt_ref_frame || refresh_frame_flags->golden_frame) && - !rc->is_src_frame_alt_ref && !cpi->use_svc && + !rc->is_src_frame_alt_ref && !cpi->ppi->use_svc && (cpi->oxcf.rc_cfg.mode != AOM_CBR || cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20)) rc->rate_correction_factors[GF_ARF_STD] = factor; @@ -564,7 +586,7 @@ void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int width, } else { projected_size_based_on_q = av1_estimate_bits_at_q( cm->current_frame.frame_type, cm->quant_params.base_qindex, MBs, - rate_correction_factor, cm->seq_params.bit_depth, + rate_correction_factor, cm->seq_params->bit_depth, cpi->is_screen_content_type); } // Work out a size correction factor. @@ -620,7 +642,7 @@ static int get_bits_per_mb(const AV1_COMP *cpi, int use_cyclic_refresh, return use_cyclic_refresh ? 
av1_cyclic_refresh_rc_bits_per_mb(cpi, q, correction_factor) : av1_rc_bits_per_mb(cm->current_frame.frame_type, q, - correction_factor, cm->seq_params.bit_depth, + correction_factor, cm->seq_params->bit_depth, cpi->is_screen_content_type); } @@ -724,26 +746,31 @@ static int get_active_quality(int q, int gfu_boost, int low, int high, } } -static int get_kf_active_quality(const RATE_CONTROL *const rc, int q, +static int get_kf_active_quality(const PRIMARY_RATE_CONTROL *const p_rc, int q, aom_bit_depth_t bit_depth) { int *kf_low_motion_minq; int *kf_high_motion_minq; ASSIGN_MINQ_TABLE(bit_depth, kf_low_motion_minq); ASSIGN_MINQ_TABLE(bit_depth, kf_high_motion_minq); - return get_active_quality(q, rc->kf_boost, kf_low, kf_high, + return get_active_quality(q, p_rc->kf_boost, kf_low, kf_high, kf_low_motion_minq, kf_high_motion_minq); } -static int get_gf_active_quality(const RATE_CONTROL *const rc, int q, - aom_bit_depth_t bit_depth) { +static int get_gf_active_quality_no_rc(int gfu_boost, int q, + aom_bit_depth_t bit_depth) { int *arfgf_low_motion_minq; int *arfgf_high_motion_minq; ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq); ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq); - return get_active_quality(q, rc->gfu_boost, gf_low, gf_high, + return get_active_quality(q, gfu_boost, gf_low, gf_high, arfgf_low_motion_minq, arfgf_high_motion_minq); } +static int get_gf_active_quality(const PRIMARY_RATE_CONTROL *const p_rc, int q, + aom_bit_depth_t bit_depth) { + return get_gf_active_quality_no_rc(p_rc->gfu_boost, q, bit_depth); +} + static int get_gf_high_motion_quality(int q, aom_bit_depth_t bit_depth) { int *arfgf_high_motion_minq; ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq); @@ -782,8 +809,9 @@ static int calc_active_worst_quality_no_stats_cbr(const AV1_COMP *cpi) { // (at buffer = critical level). 
const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc; // Buffer level below which we push active_worst to worst_quality. - int64_t critical_level = rc->optimal_buffer_level >> 3; + int64_t critical_level = p_rc->optimal_buffer_level >> 3; int64_t buff_lvl_step = 0; int adjustment = 0; int active_worst_quality; @@ -799,25 +827,26 @@ static int calc_active_worst_quality_no_stats_cbr(const AV1_COMP *cpi) { rc->avg_frame_qindex[KEY_FRAME]) : rc->avg_frame_qindex[INTER_FRAME]; active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp * 5 / 4); - if (rc->buffer_level > rc->optimal_buffer_level) { + if (rc->buffer_level > p_rc->optimal_buffer_level) { // Adjust down. // Maximum limit for down adjustment, ~30%. int max_adjustment_down = active_worst_quality / 3; if (max_adjustment_down) { - buff_lvl_step = ((rc->maximum_buffer_size - rc->optimal_buffer_level) / - max_adjustment_down); + buff_lvl_step = + ((p_rc->maximum_buffer_size - p_rc->optimal_buffer_level) / + max_adjustment_down); if (buff_lvl_step) - adjustment = (int)((rc->buffer_level - rc->optimal_buffer_level) / + adjustment = (int)((rc->buffer_level - p_rc->optimal_buffer_level) / buff_lvl_step); active_worst_quality -= adjustment; } } else if (rc->buffer_level > critical_level) { // Adjust up from ambient Q. 
if (critical_level) { - buff_lvl_step = (rc->optimal_buffer_level - critical_level); + buff_lvl_step = (p_rc->optimal_buffer_level - critical_level); if (buff_lvl_step) { adjustment = (int)((rc->worst_quality - ambient_qp) * - (rc->optimal_buffer_level - rc->buffer_level) / + (p_rc->optimal_buffer_level - rc->buffer_level) / buff_lvl_step); } active_worst_quality = ambient_qp + adjustment; @@ -835,10 +864,11 @@ static int calc_active_best_quality_no_stats_cbr(const AV1_COMP *cpi, int width, int height) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame; const CurrentFrame *const current_frame = &cm->current_frame; int *rtc_minq; - const int bit_depth = cm->seq_params.bit_depth; + const int bit_depth = cm->seq_params->bit_depth; int active_best_quality = rc->best_quality; ASSIGN_MINQ_TABLE(bit_depth, rtc_minq); @@ -846,7 +876,7 @@ static int calc_active_best_quality_no_stats_cbr(const AV1_COMP *cpi, // Handle the special case for key frames forced when we have reached // the maximum key frame interval. Here force the Q to a range // based on the ambient Q to reduce the risk of popping. - if (rc->this_key_frame_forced) { + if (p_rc->this_key_frame_forced) { int qindex = rc->last_boosted_qindex; double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); int delta_qindex = av1_compute_qdelta(rc, last_boosted_q, @@ -856,8 +886,8 @@ static int calc_active_best_quality_no_stats_cbr(const AV1_COMP *cpi, // not first frame of one pass and kf_boost is set double q_adj_factor = 1.0; double q_val; - active_best_quality = - get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth); + active_best_quality = get_kf_active_quality( + p_rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth); // Allow somewhat lower kf minq with small image formats. 
if ((width * height) <= (352 * 288)) { q_adj_factor -= 0.25; @@ -868,7 +898,7 @@ static int calc_active_best_quality_no_stats_cbr(const AV1_COMP *cpi, active_best_quality += av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth); } - } else if (!rc->is_src_frame_alt_ref && !cpi->use_svc && + } else if (!rc->is_src_frame_alt_ref && !cpi->ppi->use_svc && cpi->oxcf.rc_cfg.gf_cbr_boost_pct && (refresh_frame_flags->golden_frame || refresh_frame_flags->alt_ref_frame)) { @@ -880,7 +910,7 @@ static int calc_active_best_quality_no_stats_cbr(const AV1_COMP *cpi, rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { q = rc->avg_frame_qindex[INTER_FRAME]; } - active_best_quality = get_gf_active_quality(rc, q, bit_depth); + active_best_quality = get_gf_active_quality(p_rc, q, bit_depth); } else { // Use the lower of active_worst_quality and recent/average Q. FRAME_TYPE frame_type = @@ -913,9 +943,10 @@ static int rc_pick_q_and_bounds_no_stats_cbr(const AV1_COMP *cpi, int width, int *top_index) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const CurrentFrame *const current_frame = &cm->current_frame; int q; - const int bit_depth = cm->seq_params.bit_depth; + const int bit_depth = cm->seq_params->bit_depth; int active_worst_quality = calc_active_worst_quality_no_stats_cbr(cpi); int active_best_quality = calc_active_best_quality_no_stats_cbr( cpi, active_worst_quality, width, height); @@ -932,7 +963,7 @@ static int rc_pick_q_and_bounds_no_stats_cbr(const AV1_COMP *cpi, int width, *bottom_index = active_best_quality; // Limit Q range for the adaptive loop. 
- if (current_frame->frame_type == KEY_FRAME && !rc->this_key_frame_forced && + if (current_frame->frame_type == KEY_FRAME && !p_rc->this_key_frame_forced && current_frame->frame_number != 0) { int qdelta = 0; aom_clear_system_state(); @@ -944,7 +975,7 @@ static int rc_pick_q_and_bounds_no_stats_cbr(const AV1_COMP *cpi, int width, } // Special case code to try and match quality with forced key frames - if (current_frame->frame_type == KEY_FRAME && rc->this_key_frame_forced) { + if (current_frame->frame_type == KEY_FRAME && p_rc->this_key_frame_forced) { q = rc->last_boosted_qindex; } else { q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, @@ -1018,7 +1049,7 @@ static int get_active_cq_level(const RATE_CONTROL *rc, * \c oxcf->cq_level, or slightly modified for some * special cases) * \param[in] bit_depth Bit depth of the codec (same as - * \c cm->seq_params.bit_depth) + * \c cm->seq_params->bit_depth) * \return Returns selected q index to be used for encoding this frame. */ static int get_q_using_fixed_offsets(const AV1EncoderConfig *const oxcf, @@ -1037,13 +1068,16 @@ static int get_q_using_fixed_offsets(const AV1EncoderConfig *const oxcf, return cq_level; } offset_idx = 0; - } else if (update_type == ARF_UPDATE || update_type == GF_UPDATE) { - offset_idx = 1; - } else if (update_type == INTNL_ARF_UPDATE) { - offset_idx = - AOMMIN(gf_group->layer_depth[gf_index], FIXED_QP_OFFSET_COUNT - 1); - } else { // Leaf level / overlay frame. - assert(update_type == LF_UPDATE || update_type == OVERLAY_UPDATE || + } else if (update_type == ARF_UPDATE || update_type == GF_UPDATE || + update_type == INTNL_ARF_UPDATE || update_type == LF_UPDATE) { + if (gf_group->layer_depth[gf_index] >= + gf_group->max_layer_depth_allowed + 1) { // Leaf. + return cq_level; // Directly Return worst quality allowed. + } + offset_idx = AOMMIN(gf_group->layer_depth[gf_index], + gf_group->max_layer_depth_allowed); + } else { // Overlay frame. 
+ assert(update_type == OVERLAY_UPDATE || update_type == INTNL_OVERLAY_UPDATE); return cq_level; // Directly Return worst quality allowed. } @@ -1081,10 +1115,11 @@ static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width, int *bottom_index, int *top_index) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const CurrentFrame *const current_frame = &cm->current_frame; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame; - const GF_GROUP *const gf_group = &cpi->gf_group; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const enum aom_rc_mode rc_mode = oxcf->rc_cfg.mode; assert(has_no_stats_stage(cpi)); @@ -1097,7 +1132,7 @@ static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width, const int cq_level = get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode, cm->superres_scale_denominator); - const int bit_depth = cm->seq_params.bit_depth; + const int bit_depth = cm->seq_params->bit_depth; if (oxcf->q_cfg.use_fixed_qp_offsets) { return get_q_using_fixed_offsets(oxcf, rc, gf_group, gf_index, cq_level, @@ -1117,7 +1152,7 @@ static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width, const int delta_qindex = av1_compute_qdelta(rc, q_val, q_val * 0.25, bit_depth); active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); - } else if (rc->this_key_frame_forced) { + } else if (p_rc->this_key_frame_forced) { const int qindex = rc->last_boosted_qindex; const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); const int delta_qindex = av1_compute_qdelta( @@ -1126,8 +1161,8 @@ static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width, } else { // not first frame of one pass and kf_boost is set double q_adj_factor = 1.0; - active_best_quality = - get_kf_active_quality(rc, 
rc->avg_frame_qindex[KEY_FRAME], bit_depth); + active_best_quality = get_kf_active_quality( + p_rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth); // Allow somewhat lower kf minq with small image formats. if ((width * height) <= (352 * 288)) { @@ -1148,14 +1183,29 @@ static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width, // Use the lower of active_worst_quality and recent // average Q as basis for GF/ARF best Q limit unless last frame was // a key frame. + int avg_frame_qindex_inter_frame; + int avg_frame_qindex_key_frame; +#if CONFIG_FRAME_PARALLEL_ENCODE + avg_frame_qindex_inter_frame = + (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) + ? cpi->ppi->temp_avg_frame_qindex[INTER_FRAME] + : cpi->rc.avg_frame_qindex[INTER_FRAME]; + avg_frame_qindex_key_frame = + (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) + ? cpi->ppi->temp_avg_frame_qindex[KEY_FRAME] + : cpi->rc.avg_frame_qindex[KEY_FRAME]; +#else + avg_frame_qindex_inter_frame = rc->avg_frame_qindex[INTER_FRAME]; + avg_frame_qindex_key_frame = rc->avg_frame_qindex[KEY_FRAME]; +#endif q = (rc->frames_since_key > 1 && - rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) - ? rc->avg_frame_qindex[INTER_FRAME] - : rc->avg_frame_qindex[KEY_FRAME]; + avg_frame_qindex_inter_frame < active_worst_quality) + ? avg_frame_qindex_inter_frame + : avg_frame_qindex_key_frame; // For constrained quality dont allow Q less than the cq level if (rc_mode == AOM_CQ) { if (q < cq_level) q = cq_level; - active_best_quality = get_gf_active_quality(rc, q, bit_depth); + active_best_quality = get_gf_active_quality(p_rc, q, bit_depth); // Constrained quality use slightly lower active best. 
active_best_quality = active_best_quality * 15 / 16; } else if (rc_mode == AOM_Q) { @@ -1167,7 +1217,7 @@ static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width, : av1_compute_qdelta(rc, q_val, q_val * 0.50, bit_depth); active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); } else { - active_best_quality = get_gf_active_quality(rc, q, bit_depth); + active_best_quality = get_gf_active_quality(p_rc, q, bit_depth); } } else { if (rc_mode == AOM_Q) { @@ -1206,8 +1256,8 @@ static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width, { int qdelta = 0; aom_clear_system_state(); - if (current_frame->frame_type == KEY_FRAME && !rc->this_key_frame_forced && - current_frame->frame_number != 0) { + if (current_frame->frame_type == KEY_FRAME && + !p_rc->this_key_frame_forced && current_frame->frame_number != 0) { qdelta = av1_compute_qdelta_by_rate( &cpi->rc, current_frame->frame_type, active_worst_quality, 2.0, cpi->is_screen_content_type, bit_depth); @@ -1226,7 +1276,7 @@ static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width, q = active_best_quality; // Special case code to try and match quality with forced key frames } else if ((current_frame->frame_type == KEY_FRAME) && - rc->this_key_frame_forced) { + p_rc->this_key_frame_forced) { q = rc->last_boosted_qindex; } else { q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, @@ -1251,16 +1301,17 @@ static const double arf_layer_deltas[MAX_ARF_LAYERS + 1] = { 2.50, 2.00, 1.75, 1.50, 1.25, 1.15, 1.0 }; int av1_frame_type_qdelta(const AV1_COMP *cpi, int q) { - const GF_GROUP *const gf_group = &cpi->gf_group; - const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(gf_group); - const FRAME_TYPE frame_type = gf_group->frame_type[gf_group->index]; - const int arf_layer = AOMMIN(gf_group->layer_depth[gf_group->index], 6); + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const RATE_FACTOR_LEVEL rf_lvl = + get_rate_factor_level(gf_group, 
cpi->gf_frame_index); + const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index]; + const int arf_layer = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); const double rate_factor = (rf_lvl == INTER_NORMAL) ? 1.0 : arf_layer_deltas[arf_layer]; return av1_compute_qdelta_by_rate(&cpi->rc, frame_type, q, rate_factor, cpi->is_screen_content_type, - cpi->common.seq_params.bit_depth); + cpi->common.seq_params->bit_depth); } // This unrestricted Q selection on CQ mode is useful when testing new features, @@ -1275,7 +1326,7 @@ static int rc_pick_q_and_bounds_no_stats_cq(const AV1_COMP *cpi, int width, const int cq_level = get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode, cm->superres_scale_denominator); - const int bit_depth = cm->seq_params.bit_depth; + const int bit_depth = cm->seq_params->bit_depth; const int q = (int)av1_convert_qindex_to_q(cq_level, bit_depth); (void)width; (void)height; @@ -1295,10 +1346,11 @@ static void get_intra_q_and_bounds(const AV1_COMP *cpi, int width, int height, int cq_level, int is_fwd_kf) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const AV1EncoderConfig *const oxcf = &cpi->oxcf; int active_best_quality; int active_worst_quality = *active_worst; - const int bit_depth = cm->seq_params.bit_depth; + const int bit_depth = cm->seq_params->bit_depth; if (rc->frames_to_key <= 1 && oxcf->rc_cfg.mode == AOM_Q) { // If the next frame is also a key frame or the current frame is the @@ -1315,7 +1367,7 @@ static void get_intra_q_and_bounds(const AV1_COMP *cpi, int width, int height, const int delta_qindex = av1_compute_qdelta( rc, last_boosted_q, last_boosted_q * 0.25, bit_depth); active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); - } else if (rc->this_key_frame_forced) { + } else if (p_rc->this_key_frame_forced) { // Handle the special case for key frames forced when we have reached // 
the maximum key frame interval. Here force the Q to a range // based on the ambient Q to reduce the risk of popping. @@ -1324,8 +1376,8 @@ static void get_intra_q_and_bounds(const AV1_COMP *cpi, int width, int height, int qindex; if (is_stat_consumption_stage_twopass(cpi) && - cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { - qindex = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex); + cpi->ppi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { + qindex = AOMMIN(p_rc->last_kf_qindex, rc->last_boosted_qindex); active_best_quality = qindex; last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); delta_qindex = av1_compute_qdelta(rc, last_boosted_q, @@ -1346,13 +1398,13 @@ static void get_intra_q_and_bounds(const AV1_COMP *cpi, int width, int height, // Baseline value derived from cpi->active_worst_quality and kf boost. active_best_quality = - get_kf_active_quality(rc, active_worst_quality, bit_depth); + get_kf_active_quality(p_rc, active_worst_quality, bit_depth); if (cpi->is_screen_content_type) { active_best_quality /= 2; } if (is_stat_consumption_stage_twopass(cpi) && - cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) { + cpi->ppi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) { active_best_quality /= 3; } @@ -1363,7 +1415,8 @@ static void get_intra_q_and_bounds(const AV1_COMP *cpi, int width, int height, // Make a further adjustment based on the kf zero motion measure. if (is_stat_consumption_stage_twopass(cpi)) - q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct); + q_adj_factor += + 0.05 - (0.001 * (double)cpi->ppi->twopass.kf_zeromotion_pct); // Convert the adjustment factor to a qindex delta // on active_best_quality. 
@@ -1394,8 +1447,9 @@ static void adjust_active_best_and_worst_quality(const AV1_COMP *cpi, int *active_best) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame; - const int bit_depth = cpi->common.seq_params.bit_depth; + const int bit_depth = cpi->common.seq_params->bit_depth; int active_best_quality = *active_best; int active_worst_quality = *active_worst; // Extension to max or min Q if undershoot or overshoot is outside @@ -1406,20 +1460,21 @@ static void adjust_active_best_and_worst_quality(const AV1_COMP *cpi, (refresh_frame_flags->golden_frame || is_intrl_arf_boost || refresh_frame_flags->alt_ref_frame))) { active_best_quality -= - (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast); - active_worst_quality += (cpi->twopass.extend_maxq / 2); + (cpi->ppi->twopass.extend_minq + cpi->ppi->twopass.extend_minq_fast); + active_worst_quality += (cpi->ppi->twopass.extend_maxq / 2); } else { active_best_quality -= - (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2; - active_worst_quality += cpi->twopass.extend_maxq; + (cpi->ppi->twopass.extend_minq + cpi->ppi->twopass.extend_minq_fast) / + 2; + active_worst_quality += cpi->ppi->twopass.extend_maxq; } } aom_clear_system_state(); #ifndef STRICT_RC // Static forced key frames Q restrictions dealt with elsewhere. 
- if (!(frame_is_intra_only(cm)) || !rc->this_key_frame_forced || - (cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) { + if (!(frame_is_intra_only(cm)) || !p_rc->this_key_frame_forced || + (cpi->ppi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) { const int qdelta = av1_frame_type_qdelta(cpi, active_worst_quality); active_worst_quality = AOMMAX(active_worst_quality + qdelta, active_best_quality); @@ -1464,18 +1519,19 @@ static int get_q(const AV1_COMP *cpi, const int width, const int height, const int active_best_quality) { const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; int q; if (cpi->oxcf.rc_cfg.mode == AOM_Q || - (frame_is_intra_only(cm) && !rc->this_key_frame_forced && - cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH && + (frame_is_intra_only(cm) && !p_rc->this_key_frame_forced && + cpi->ppi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH && rc->frames_to_key > 1)) { q = active_best_quality; // Special case code to try and match quality with forced key frames. - } else if (frame_is_intra_only(cm) && rc->this_key_frame_forced) { + } else if (frame_is_intra_only(cm) && p_rc->this_key_frame_forced) { // If static since last kf use better of last boosted and last kf q. 
- if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { - q = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex); + if (cpi->ppi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { + q = AOMMIN(p_rc->last_kf_qindex, rc->last_boosted_qindex); } else { q = AOMMIN(rc->last_boosted_qindex, (active_best_quality + active_worst_quality) / 2); @@ -1504,20 +1560,29 @@ static int get_active_best_quality(const AV1_COMP *const cpi, const int active_worst_quality, const int cq_level, const int gf_index) { const AV1_COMMON *const cm = &cpi->common; - const int bit_depth = cm->seq_params.bit_depth; + const int bit_depth = cm->seq_params->bit_depth; const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame; - const GF_GROUP *gf_group = &cpi->gf_group; + const GF_GROUP *gf_group = &cpi->ppi->gf_group; const enum aom_rc_mode rc_mode = oxcf->rc_cfg.mode; int *inter_minq; + int avg_frame_qindex_inter_frame; ASSIGN_MINQ_TABLE(bit_depth, inter_minq); int active_best_quality = 0; const int is_intrl_arf_boost = gf_group->update_type[gf_index] == INTNL_ARF_UPDATE; - const int is_leaf_frame = - !(refresh_frame_flags->golden_frame || - refresh_frame_flags->alt_ref_frame || is_intrl_arf_boost); + int is_leaf_frame = + !(gf_group->update_type[gf_index] == ARF_UPDATE || + gf_group->update_type[gf_index] == GF_UPDATE || is_intrl_arf_boost); + + // TODO(jingning): Consider to rework this hack that covers issues incurred + // in lightfield setting. 
+ if (cm->tiles.large_scale) { + is_leaf_frame = !(refresh_frame_flags->golden_frame || + refresh_frame_flags->alt_ref_frame || is_intrl_arf_boost); + } const int is_overlay_frame = rc->is_src_frame_alt_ref; if (is_leaf_frame || is_overlay_frame) { @@ -1532,31 +1597,35 @@ static int get_active_best_quality(const AV1_COMP *const cpi, return active_best_quality; } - // TODO(chengchen): can we remove this condition? - if (rc_mode == AOM_Q && !refresh_frame_flags->alt_ref_frame && - !refresh_frame_flags->golden_frame && !is_intrl_arf_boost) { - return cq_level; - } - // Determine active_best_quality for frames that are not leaf or overlay. int q = active_worst_quality; +#if CONFIG_FRAME_PARALLEL_ENCODE + // For quality simulation purpose - for parallel frames use previous + // avg_frame_qindex + avg_frame_qindex_inter_frame = + (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) + ? cpi->ppi->temp_avg_frame_qindex[INTER_FRAME] + : rc->avg_frame_qindex[INTER_FRAME]; +#else + avg_frame_qindex_inter_frame = rc->avg_frame_qindex[INTER_FRAME]; +#endif // CONFIG_FRAME_PARALLEL_ENCODE // Use the lower of active_worst_quality and recent // average Q as basis for GF/ARF best Q limit unless last frame was // a key frame. if (rc->frames_since_key > 1 && - rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { - q = rc->avg_frame_qindex[INTER_FRAME]; + avg_frame_qindex_inter_frame < active_worst_quality) { + q = avg_frame_qindex_inter_frame; } if (rc_mode == AOM_CQ && q < cq_level) q = cq_level; - active_best_quality = get_gf_active_quality(rc, q, bit_depth); + active_best_quality = get_gf_active_quality(p_rc, q, bit_depth); // Constrained quality use slightly lower active best. 
if (rc_mode == AOM_CQ) active_best_quality = active_best_quality * 15 / 16; const int min_boost = get_gf_high_motion_quality(q, bit_depth); const int boost = min_boost - active_best_quality; - active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor); + active_best_quality = min_boost - (int)(boost * p_rc->arf_boost_factor); if (!is_intrl_arf_boost) return active_best_quality; - if (rc_mode == AOM_Q || rc_mode == AOM_CQ) active_best_quality = rc->arf_q; + if (rc_mode == AOM_Q || rc_mode == AOM_CQ) active_best_quality = p_rc->arf_q; int this_height = gf_group_pyramid_level(gf_group, gf_index); while (this_height > 1) { active_best_quality = (active_best_quality + active_worst_quality + 1) / 2; @@ -1565,6 +1634,87 @@ static int get_active_best_quality(const AV1_COMP *const cpi, return active_best_quality; } +// Returns the q_index for a single frame in the GOP. +// This function assumes that rc_mode == AOM_Q mode. +int av1_q_mode_get_q_index(int base_q_index, const GF_GROUP *gf_group, + const int gf_index, int arf_q) { + const int is_intrl_arf_boost = + gf_group->update_type[gf_index] == INTNL_ARF_UPDATE; + int is_leaf_or_overlay_frame = + gf_group->update_type[gf_index] == LF_UPDATE || + gf_group->update_type[gf_index] == OVERLAY_UPDATE || + gf_group->update_type[gf_index] == INTNL_OVERLAY_UPDATE; + + if (is_leaf_or_overlay_frame) return base_q_index; + + if (!is_intrl_arf_boost) return arf_q; + + int active_best_quality = arf_q; + int active_worst_quality = base_q_index; + int this_height = gf_group_pyramid_level(gf_group, gf_index); + while (this_height > 1) { + active_best_quality = (active_best_quality + active_worst_quality + 1) / 2; + --this_height; + } + return active_best_quality; +} + +// Returns the q_index for the ARF in the GOP. 
+int av1_get_arf_q_index(int base_q_index, int gfu_boost, int bit_depth, + int arf_boost_factor) { + int active_best_quality = + get_gf_active_quality_no_rc(gfu_boost, base_q_index, bit_depth); + const int min_boost = get_gf_high_motion_quality(base_q_index, bit_depth); + const int boost = min_boost - active_best_quality; + return min_boost - (int)(boost * arf_boost_factor); +} + +static int rc_pick_q_and_bounds_q_mode(const AV1_COMP *cpi, int width, + int height, int gf_index, + int *bottom_index, int *top_index) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const int cq_level = + get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode, + cm->superres_scale_denominator); + int active_best_quality = 0; + int active_worst_quality = rc->active_worst_quality; + int q; + + if (frame_is_intra_only(cm)) { + const int is_fwd_kf = cm->current_frame.frame_type == KEY_FRAME && + cm->show_frame == 0 && cpi->no_show_fwd_kf; + get_intra_q_and_bounds(cpi, width, height, &active_best_quality, + &active_worst_quality, cq_level, is_fwd_kf); + } else { + // Active best quality limited by previous layer. 
+ active_best_quality = + get_active_best_quality(cpi, active_worst_quality, cq_level, gf_index); + } + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + + *top_index = AOMMAX(*top_index, rc->best_quality); + *top_index = AOMMIN(*top_index, rc->worst_quality); + + *bottom_index = AOMMAX(*bottom_index, rc->best_quality); + *bottom_index = AOMMIN(*bottom_index, rc->worst_quality); + + q = active_best_quality; + + q = AOMMAX(q, rc->best_quality); + q = AOMMIN(q, rc->worst_quality); + + assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + + return q; +} + /*!\brief Picks q and q bounds given rate control parameters in \c cpi->rc. * * Handles the the general cases not covered by @@ -1587,20 +1737,25 @@ static int rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height, const RATE_CONTROL *const rc = &cpi->rc; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame; - const GF_GROUP *gf_group = &cpi->gf_group; + const GF_GROUP *gf_group = &cpi->ppi->gf_group; assert(IMPLIES(has_no_stats_stage(cpi), cpi->oxcf.rc_cfg.mode == AOM_Q && gf_group->update_type[gf_index] != ARF_UPDATE)); const int cq_level = get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode, cm->superres_scale_denominator); - const int bit_depth = cm->seq_params.bit_depth; + const int bit_depth = cm->seq_params->bit_depth; if (oxcf->q_cfg.use_fixed_qp_offsets) { - return get_q_using_fixed_offsets(oxcf, rc, gf_group, gf_group->index, + return get_q_using_fixed_offsets(oxcf, rc, gf_group, cpi->gf_frame_index, cq_level, bit_depth); } + if (oxcf->rc_cfg.mode == AOM_Q) { + return rc_pick_q_and_bounds_q_mode(cpi, width, height, gf_index, + bottom_index, top_index); + } + int active_best_quality = 0; int active_worst_quality = 
rc->active_worst_quality; int q; @@ -1620,8 +1775,7 @@ static int rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height, // Active best quality limited by previous layer. const int pyramid_level = gf_group_pyramid_level(gf_group, gf_index); - if ((pyramid_level <= 1) || (pyramid_level > MAX_ARF_LAYERS) || - (oxcf->rc_cfg.mode == AOM_Q)) { + if ((pyramid_level <= 1) || (pyramid_level > MAX_ARF_LAYERS)) { active_best_quality = get_active_best_quality(cpi, active_worst_quality, cq_level, gf_index); } else { @@ -1668,13 +1822,13 @@ static int rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height, return q; } -int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, RATE_CONTROL *rc, int width, - int height, int gf_index, int *bottom_index, - int *top_index) { +int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height, + int gf_index, int *bottom_index, int *top_index) { + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; int q; // TODO(sarahparker) merge no-stats vbr and altref q computation // with rc_pick_q_and_bounds(). 
- const GF_GROUP *gf_group = &cpi->gf_group; + const GF_GROUP *gf_group = &cpi->ppi->gf_group; if ((cpi->oxcf.rc_cfg.mode != AOM_Q || gf_group->update_type[gf_index] == ARF_UPDATE) && has_no_stats_stage(cpi)) { @@ -1694,7 +1848,7 @@ int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, RATE_CONTROL *rc, int width, q = rc_pick_q_and_bounds(cpi, width, height, gf_index, bottom_index, top_index); } - if (gf_group->update_type[gf_index] == ARF_UPDATE) rc->arf_q = q; + if (gf_group->update_type[gf_index] == ARF_UPDATE) p_rc->arf_q = q; return q; } @@ -1756,11 +1910,12 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { const AV1_COMMON *const cm = &cpi->common; const CurrentFrame *const current_frame = &cm->current_frame; RATE_CONTROL *const rc = &cpi->rc; - const GF_GROUP *const gf_group = &cpi->gf_group; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame; const int is_intrnl_arf = - gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE; + gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE; const int qindex = cm->quant_params.base_qindex; @@ -1776,7 +1931,7 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { rc->avg_frame_qindex[KEY_FRAME] = ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2); } else { - if ((cpi->use_svc && cpi->oxcf.rc_cfg.mode == AOM_CBR) || + if ((cpi->ppi->use_svc && cpi->oxcf.rc_cfg.mode == AOM_CBR) || (!rc->is_src_frame_alt_ref && !(refresh_frame_flags->golden_frame || is_intrnl_arf || refresh_frame_flags->alt_ref_frame))) { @@ -1784,7 +1939,7 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { rc->avg_frame_qindex[INTER_FRAME] = ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2); rc->ni_frames++; - rc->tot_q += av1_convert_qindex_to_q(qindex, cm->seq_params.bit_depth); + rc->tot_q += 
av1_convert_qindex_to_q(qindex, cm->seq_params->bit_depth); rc->avg_q = rc->tot_q / rc->ni_frames; // Calculate the average Q for normal inter frames (not key or GFU // frames). @@ -1792,7 +1947,23 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames; } } - +#if CONFIG_FRAME_PARALLEL_ENCODE + /* TODO(FPMT): The current update is happening in cpi->rc.avg_frame_qindex, + * this need to be taken care appropriately in final FPMT implementation + * to carry these values to subsequent frames. The avg_frame_qindex update + * is accumulated across frames, so the values from all individual parallel + * frames need to be taken into account after all the parallel frames are + * encoded. + * + * The variable temp_avg_frame_qindex introduced only for quality simulation + * purpose, it retains the value previous to the parallel encode frames. The + * variable is updated based on the update flag. + */ + if (cpi->do_frame_data_update && !rc->is_src_frame_alt_ref) { + for (int index = 0; index < FRAME_TYPES; index++) + cpi->ppi->temp_avg_frame_qindex[index] = rc->avg_frame_qindex[index]; + } +#endif // CONFIG_FRAME_PARALLEL_ENCODE // Keep record of last boosted (KF/GF/ARF) Q value. // If the current frame is coded at a lower Q then we also update it. 
// If all mbs in this group are skipped only update if the Q value is @@ -1800,12 +1971,12 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { // This is used to help set quality in forced key frames to reduce popping if ((qindex < rc->last_boosted_qindex) || (current_frame->frame_type == KEY_FRAME) || - (!rc->constrained_gf_group && + (!p_rc->constrained_gf_group && (refresh_frame_flags->alt_ref_frame || is_intrnl_arf || (refresh_frame_flags->golden_frame && !rc->is_src_frame_alt_ref)))) { rc->last_boosted_qindex = qindex; } - if (current_frame->frame_type == KEY_FRAME) rc->last_kf_qindex = qindex; + if (current_frame->frame_type == KEY_FRAME) p_rc->last_kf_qindex = qindex; update_buffer_level(cpi, rc->projected_frame_size); rc->prev_avg_frame_bandwidth = rc->avg_frame_bandwidth; @@ -1853,6 +2024,7 @@ void av1_rc_postencode_update_drop_frame(AV1_COMP *cpi) { cpi->rc.frames_to_key--; cpi->rc.rc_2_frame = 0; cpi->rc.rc_1_frame = 0; + cpi->rc.prev_avg_frame_bandwidth = cpi->rc.avg_frame_bandwidth; } int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth, @@ -1954,7 +2126,7 @@ void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi, * The no.of.stats available in the case of LAP is limited, * hence setting to max_gf_interval. */ - if (cpi->lap_enabled) + if (cpi->ppi->lap_enabled) rc->static_scene_max_gf_interval = rc->max_gf_interval + 1; else rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH; @@ -2003,8 +2175,8 @@ static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) { RATE_CONTROL *const rc = &cpi->rc; int64_t vbr_bits_off_target = rc->vbr_bits_off_target; const int stats_count = - cpi->twopass.stats_buf_ctx->total_stats != NULL - ? (int)cpi->twopass.stats_buf_ctx->total_stats->count + cpi->ppi->twopass.stats_buf_ctx->total_stats != NULL + ? 
(int)cpi->ppi->twopass.stats_buf_ctx->total_stats->count : 0; const int frame_window = AOMMIN( 16, (int)(stats_count - (int)cpi->common.current_frame.frame_number)); @@ -2048,16 +2220,17 @@ int av1_calc_pframe_target_size_one_pass_vbr( const AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type) { static const int af_ratio = 10; const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; int64_t target; #if USE_ALTREF_FOR_ONE_PASS if (frame_update_type == KF_UPDATE || frame_update_type == GF_UPDATE || frame_update_type == ARF_UPDATE) { - target = ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval * + target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval * af_ratio) / - (rc->baseline_gf_interval + af_ratio - 1); + (p_rc->baseline_gf_interval + af_ratio - 1); } else { - target = ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval) / - (rc->baseline_gf_interval + af_ratio - 1); + target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval) / + (p_rc->baseline_gf_interval + af_ratio - 1); } if (target > INT_MAX) target = INT_MAX; #else @@ -2077,9 +2250,10 @@ int av1_calc_pframe_target_size_one_pass_cbr( const AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type) { const AV1EncoderConfig *oxcf = &cpi->oxcf; const RATE_CONTROL *rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc; const RateControlCfg *rc_cfg = &oxcf->rc_cfg; - const int64_t diff = rc->optimal_buffer_level - rc->buffer_level; - const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100; + const int64_t diff = p_rc->optimal_buffer_level - rc->buffer_level; + const int64_t one_pct_bits = 1 + p_rc->optimal_buffer_level / 100; int min_frame_target = AOMMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS); int target; @@ -2087,17 +2261,17 @@ int av1_calc_pframe_target_size_one_pass_cbr( if (rc_cfg->gf_cbr_boost_pct) { const int af_ratio_pct = rc_cfg->gf_cbr_boost_pct + 100; if (frame_update_type == 
GF_UPDATE || frame_update_type == OVERLAY_UPDATE) { - target = - (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio_pct) / - (rc->baseline_gf_interval * 100 + af_ratio_pct - 100); + target = (rc->avg_frame_bandwidth * p_rc->baseline_gf_interval * + af_ratio_pct) / + (p_rc->baseline_gf_interval * 100 + af_ratio_pct - 100); } else { - target = (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) / - (rc->baseline_gf_interval * 100 + af_ratio_pct - 100); + target = (rc->avg_frame_bandwidth * p_rc->baseline_gf_interval * 100) / + (p_rc->baseline_gf_interval * 100 + af_ratio_pct - 100); } } else { target = rc->avg_frame_bandwidth; } - if (cpi->use_svc) { + if (cpi->ppi->use_svc) { // Note that for layers, avg_frame_bandwidth is the cumulative // per-frame-bandwidth. For the target size of this frame, use the // layer average frame size (i.e., non-cumulative per-frame-bw). @@ -2129,11 +2303,12 @@ int av1_calc_pframe_target_size_one_pass_cbr( int av1_calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) { const RATE_CONTROL *rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc; int target; if (cpi->common.current_frame.frame_number == 0) { - target = ((rc->starting_buffer_level / 2) > INT_MAX) + target = ((p_rc->starting_buffer_level / 2) > INT_MAX) ? 
INT_MAX - : (int)(rc->starting_buffer_level / 2); + : (int)(p_rc->starting_buffer_level / 2); } else { int kf_boost = 32; double framerate = cpi->framerate; @@ -2177,7 +2352,7 @@ void av1_set_reference_structure_one_pass_rt(AV1_COMP *cpi, int gf_update) { int gld_idx = 0; int alt_ref_idx = 0; ext_refresh_frame_flags->update_pending = 1; - svc->external_ref_frame_config = 1; + svc->set_ref_frame_config = 1; ext_flags->ref_frame_flags = 0; ext_refresh_frame_flags->last_frame = 1; ext_refresh_frame_flags->golden_frame = 0; @@ -2268,9 +2443,9 @@ static void rc_scene_detection_onepass_rt(AV1_COMP *cpi) { int num_samples = 0; const int thresh = 6; // SAD is computed on 64x64 blocks - const int sb_size_by_mb = (cm->seq_params.sb_size == BLOCK_128X128) - ? (cm->seq_params.mib_size >> 1) - : cm->seq_params.mib_size; + const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128) + ? (cm->seq_params->mib_size >> 1) + : cm->seq_params->mib_size; const int sb_cols = (num_mi_cols + sb_size_by_mb - 1) / sb_size_by_mb; const int sb_rows = (num_mi_rows + sb_size_by_mb - 1) / sb_size_by_mb; uint64_t sum_sq_thresh = 10000; // sum = sqrt(thresh / 64*64)) ~1.5 @@ -2286,12 +2461,12 @@ static void rc_scene_detection_onepass_rt(AV1_COMP *cpi) { (sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1) && ((sbi_row % 2 == 0 && sbi_col % 2 == 0) || (sbi_row % 2 != 0 && sbi_col % 2 != 0)))) { - tmp_sad = cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, - last_src_ystride); + tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, + last_src_ystride); if (check_light_change) { unsigned int sse, variance; - variance = cpi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y, - last_src_ystride, &sse); + variance = cpi->ppi->fn_ptr[bsize].vf( + src_y, src_ystride, last_src_y, last_src_ystride, &sse); // Note: sse - variance = ((sum * sum) >> 12) // Detect large lighting change. 
if (variance < (sse >> 1) && (sse - variance) > sum_sq_thresh) { @@ -2344,7 +2519,8 @@ static void rc_scene_detection_onepass_rt(AV1_COMP *cpi) { static int set_gf_interval_update_onepass_rt(AV1_COMP *cpi, FRAME_TYPE frame_type) { RATE_CONTROL *const rc = &cpi->rc; - GF_GROUP *const gf_group = &cpi->gf_group; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; ResizePendingParams *const resize_pending_params = &cpi->resize_pending_params; int gf_update = 0; @@ -2360,34 +2536,34 @@ static int set_gf_interval_update_onepass_rt(AV1_COMP *cpi, if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ) av1_cyclic_refresh_set_golden_update(cpi); else - rc->baseline_gf_interval = MAX_GF_INTERVAL; - if (rc->baseline_gf_interval > rc->frames_to_key) - rc->baseline_gf_interval = rc->frames_to_key; - rc->gfu_boost = DEFAULT_GF_BOOST_RT; - rc->constrained_gf_group = - (rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0; - rc->frames_till_gf_update_due = rc->baseline_gf_interval; - gf_group->index = 0; + p_rc->baseline_gf_interval = MAX_GF_INTERVAL; + if (p_rc->baseline_gf_interval > rc->frames_to_key) + p_rc->baseline_gf_interval = rc->frames_to_key; + p_rc->gfu_boost = DEFAULT_GF_BOOST_RT; + p_rc->constrained_gf_group = + (p_rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0; + rc->frames_till_gf_update_due = p_rc->baseline_gf_interval; + cpi->gf_frame_index = 0; // SVC does not use GF as periodic boost. // TODO(marpan): Find better way to disable this for SVC. 
- if (cpi->use_svc) { + if (cpi->ppi->use_svc) { SVC *const svc = &cpi->svc; - rc->baseline_gf_interval = MAX_STATIC_GF_GROUP_LENGTH - 1; - rc->gfu_boost = 1; - rc->constrained_gf_group = 0; - rc->frames_till_gf_update_due = rc->baseline_gf_interval; + p_rc->baseline_gf_interval = MAX_STATIC_GF_GROUP_LENGTH - 1; + p_rc->gfu_boost = 1; + p_rc->constrained_gf_group = 0; + rc->frames_till_gf_update_due = p_rc->baseline_gf_interval; for (int layer = 0; layer < svc->number_spatial_layers * svc->number_temporal_layers; ++layer) { LAYER_CONTEXT *const lc = &svc->layer_context[layer]; - lc->rc.baseline_gf_interval = rc->baseline_gf_interval; - lc->rc.gfu_boost = rc->gfu_boost; - lc->rc.constrained_gf_group = rc->constrained_gf_group; + lc->p_rc.baseline_gf_interval = p_rc->baseline_gf_interval; + lc->p_rc.gfu_boost = p_rc->gfu_boost; + lc->p_rc.constrained_gf_group = p_rc->constrained_gf_group; lc->rc.frames_till_gf_update_due = rc->frames_till_gf_update_due; lc->group_index = 0; } } - gf_group->size = rc->baseline_gf_interval; + gf_group->size = p_rc->baseline_gf_interval; gf_group->update_type[0] = (frame_type == KEY_FRAME) ? KF_UPDATE : GF_UPDATE; gf_update = 1; @@ -2398,6 +2574,7 @@ static int set_gf_interval_update_onepass_rt(AV1_COMP *cpi, static void resize_reset_rc(AV1_COMP *cpi, int resize_width, int resize_height, int prev_width, int prev_height) { RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; SVC *const svc = &cpi->svc; double tot_scale_change = 1.0; int target_bits_per_frame; @@ -2406,8 +2583,8 @@ static void resize_reset_rc(AV1_COMP *cpi, int resize_width, int resize_height, tot_scale_change = (double)(resize_width * resize_height) / (double)(prev_width * prev_height); // Reset buffer level to optimal, update target size. 
- rc->buffer_level = rc->optimal_buffer_level; - rc->bits_off_target = rc->optimal_buffer_level; + rc->buffer_level = p_rc->optimal_buffer_level; + rc->bits_off_target = p_rc->optimal_buffer_level; rc->this_frame_target = av1_calc_pframe_target_size_one_pass_cbr(cpi, INTER_FRAME); target_bits_per_frame = rc->this_frame_target; @@ -2431,8 +2608,8 @@ static void resize_reset_rc(AV1_COMP *cpi, int resize_width, int resize_height, svc->number_temporal_layers + tl]; lc->rc.resize_state = rc->resize_state; - lc->rc.buffer_level = lc->rc.optimal_buffer_level; - lc->rc.bits_off_target = lc->rc.optimal_buffer_level; + lc->rc.buffer_level = lc->p_rc.optimal_buffer_level; + lc->rc.bits_off_target = lc->p_rc.optimal_buffer_level; lc->rc.rate_correction_factors[INTER_FRAME] = rc->rate_correction_factors[INTER_FRAME]; } @@ -2464,6 +2641,7 @@ static void resize_reset_rc(AV1_COMP *cpi, int resize_width, int resize_height, static void dynamic_resize_one_pass_cbr(AV1_COMP *cpi) { const AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; RESIZE_ACTION resize_action = NO_RESIZE; const int avg_qp_thr1 = 70; const int avg_qp_thr2 = 50; @@ -2486,7 +2664,7 @@ static void dynamic_resize_one_pass_cbr(AV1_COMP *cpi) { if (cpi->rc.frames_since_key > cpi->framerate) { const int window = AOMMIN(30, (int)(2 * cpi->framerate)); rc->resize_avg_qp += rc->last_q[INTER_FRAME]; - if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100)) + if (cpi->rc.buffer_level < (int)(30 * p_rc->optimal_buffer_level / 100)) ++rc->resize_buffer_underflow; ++rc->resize_count; // Check for resize action every "window" frames. 
@@ -2548,8 +2726,9 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi, EncodeFrameParams *const frame_params, unsigned int frame_flags) { RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; AV1_COMMON *const cm = &cpi->common; - GF_GROUP *const gf_group = &cpi->gf_group; + GF_GROUP *const gf_group = &cpi->ppi->gf_group; SVC *const svc = &cpi->svc; ResizePendingParams *const resize_pending_params = &cpi->resize_pending_params; @@ -2559,35 +2738,35 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi, svc->number_temporal_layers); // Turn this on to explicitly set the reference structure rather than // relying on internal/default structure. - if (cpi->use_svc) { + if (cpi->ppi->use_svc) { av1_update_temporal_layer_framerate(cpi); av1_restore_layer_context(cpi); } // Set frame type. - if ((!cpi->use_svc && rc->frames_to_key == 0) || - (cpi->use_svc && svc->spatial_layer_id == 0 && + if ((!cpi->ppi->use_svc && rc->frames_to_key == 0) || + (cpi->ppi->use_svc && svc->spatial_layer_id == 0 && (cpi->oxcf.kf_cfg.key_freq_max == 0 || svc->current_superframe % cpi->oxcf.kf_cfg.key_freq_max == 0)) || (frame_flags & FRAMEFLAGS_KEY)) { frame_params->frame_type = KEY_FRAME; - rc->this_key_frame_forced = + p_rc->this_key_frame_forced = cm->current_frame.frame_number != 0 && rc->frames_to_key == 0; rc->frames_to_key = cpi->oxcf.kf_cfg.key_freq_max; - rc->kf_boost = DEFAULT_KF_BOOST_RT; - gf_group->update_type[gf_group->index] = KF_UPDATE; - gf_group->frame_type[gf_group->index] = KEY_FRAME; - gf_group->refbuf_state[gf_group->index] = REFBUF_RESET; - if (cpi->use_svc) { + p_rc->kf_boost = DEFAULT_KF_BOOST_RT; + gf_group->update_type[cpi->gf_frame_index] = KF_UPDATE; + gf_group->frame_type[cpi->gf_frame_index] = KEY_FRAME; + gf_group->refbuf_state[cpi->gf_frame_index] = REFBUF_RESET; + if (cpi->ppi->use_svc) { if (cm->current_frame.frame_number > 0) av1_svc_reset_temporal_layers(cpi, 1); svc->layer_context[layer].is_key_frame = 1; } } else { 
frame_params->frame_type = INTER_FRAME; - gf_group->update_type[gf_group->index] = LF_UPDATE; - gf_group->frame_type[gf_group->index] = INTER_FRAME; - gf_group->refbuf_state[gf_group->index] = REFBUF_UPDATE; - if (cpi->use_svc) { + gf_group->update_type[cpi->gf_frame_index] = LF_UPDATE; + gf_group->frame_type[cpi->gf_frame_index] = INTER_FRAME; + gf_group->refbuf_state[cpi->gf_frame_index] = REFBUF_UPDATE; + if (cpi->ppi->use_svc) { LAYER_CONTEXT *lc = &svc->layer_context[layer]; lc->is_key_frame = svc->spatial_layer_id == 0 @@ -2596,7 +2775,7 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi, } } // Check for scene change, for non-SVC for now. - if (!cpi->use_svc && cpi->sf.rt_sf.check_scene_detection) + if (!cpi->ppi->use_svc && cpi->sf.rt_sf.check_scene_detection) rc_scene_detection_onepass_rt(cpi); // Check for dynamic resize, for single spatial layer for now. // For temporal layers only check on base temporal layer. @@ -2628,14 +2807,14 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi, target = av1_calc_iframe_target_size_one_pass_cbr(cpi); } else { target = av1_calc_pframe_target_size_one_pass_cbr( - cpi, gf_group->update_type[gf_group->index]); + cpi, gf_group->update_type[cpi->gf_frame_index]); } } else { if (frame_params->frame_type == KEY_FRAME) { target = av1_calc_iframe_target_size_one_pass_vbr(cpi); } else { target = av1_calc_pframe_target_size_one_pass_vbr( - cpi, gf_group->update_type[gf_group->index]); + cpi, gf_group->update_type[cpi->gf_frame_index]); } } if (cpi->oxcf.rc_cfg.mode == AOM_Q) @@ -2644,11 +2823,21 @@ void av1_get_one_pass_rt_params(AV1_COMP *cpi, av1_rc_set_frame_target(cpi, target, cm->width, cm->height); rc->base_frame_target = target; cm->current_frame.frame_type = frame_params->frame_type; + // For fixed mode SVC: if KSVC is enabled remove inter layer + // prediction on spatial enhancement layer frames for frames + // whose base is not KEY frame. 
+ if (cpi->ppi->use_svc && !svc->use_flexible_mode && svc->ksvc_fixed_mode && + svc->number_spatial_layers > 1 && + !svc->layer_context[layer].is_key_frame) { + ExternalFlags *const ext_flags = &cpi->ext_flags; + ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG; + } } int av1_encodedframe_overshoot_cbr(AV1_COMP *cpi, int *q) { AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; SPEED_FEATURES *const sf = &cpi->sf; int thresh_qp = 7 * (rc->worst_quality >> 3); // Lower thresh_qp for video (more overshoot at lower Q) to be @@ -2670,8 +2859,8 @@ int av1_encodedframe_overshoot_cbr(AV1_COMP *cpi, int *q) { // have settled down to a very different (low QP) state, then not adjusting // them may cause next frame to select low QP and overshoot again. cpi->rc.avg_frame_qindex[INTER_FRAME] = *q; - rc->buffer_level = rc->optimal_buffer_level; - rc->bits_off_target = rc->optimal_buffer_level; + rc->buffer_level = p_rc->optimal_buffer_level; + rc->bits_off_target = p_rc->optimal_buffer_level; // Reset rate under/over-shoot flags. cpi->rc.rc_1_frame = 0; cpi->rc.rc_2_frame = 0; @@ -2680,7 +2869,7 @@ int av1_encodedframe_overshoot_cbr(AV1_COMP *cpi, int *q) { (int)(((uint64_t)target_size << BPER_MB_NORMBITS) / cm->mi_params.MBs); // Rate correction factor based on target_bits_per_mb and qp (==max_QP). // This comes from the inverse computation of vp9_rc_bits_per_mb(). - q2 = av1_convert_qindex_to_q(*q, cm->seq_params.bit_depth); + q2 = av1_convert_qindex_to_q(*q, cm->seq_params->bit_depth); enumerator = 1800000; // Factor for inter frame. 
enumerator += (int)(enumerator * q2) >> 12; new_correction_factor = (double)target_bits_per_mb * q2 / enumerator; diff --git a/third_party/libaom/source/libaom/av1/encoder/ratectrl.h b/third_party/libaom/source/libaom/av1/encoder/ratectrl.h index 3f1756f5ca..a1567f038c 100644 --- a/third_party/libaom/source/libaom/av1/encoder/ratectrl.h +++ b/third_party/libaom/source/libaom/av1/encoder/ratectrl.h @@ -129,11 +129,6 @@ typedef struct { int this_frame_target; // Actual frame target after rc adjustment. /*! - * Target bit budget for the current GF / ARF group of frame. - */ - int64_t gf_group_bits; - - /*! * Projected size for current frame */ int projected_frame_size; @@ -159,20 +154,6 @@ typedef struct { int last_boosted_qindex; /*! - * Q used for last boosted (non leaf) frame - */ - int last_kf_qindex; - - /*! - * Boost factor used to calculate the extra bits allocated to ARFs and GFs - */ - int gfu_boost; - /*! - * Boost factor used to calculate the extra bits allocated to the key frame - */ - int kf_boost; - - /*! * Correction factors used to adjust the q estimate for a given target rate * in the encode loop. */ @@ -193,28 +174,10 @@ typedef struct { */ int intervals_till_gf_calculate_due; - /*! - * Stores the determined gf group lengths for a set of gf groups - */ - int gf_intervals[MAX_NUM_GF_INTERVALS]; - - /*! - * The current group's index into gf_intervals[] - */ - int cur_gf_index; - /*!\cond */ - int num_regions; - REGIONS regions[MAX_FIRSTPASS_ANALYSIS_FRAMES]; - double cor_coeff[MAX_FIRSTPASS_ANALYSIS_FRAMES]; - int regions_offset; // offset of regions from the last keyframe - int frames_till_regions_update; - int min_gf_interval; int max_gf_interval; int static_scene_max_gf_interval; - int baseline_gf_interval; - int constrained_gf_group; /*!\endcond */ /*! 
* Frames before the next key frame @@ -222,8 +185,6 @@ typedef struct { int frames_to_key; /*!\cond */ int frames_since_key; - int this_key_frame_forced; - int next_key_frame_forced; int is_src_frame_alt_ref; int sframe_due; @@ -269,18 +230,6 @@ typedef struct { */ int best_quality; - /*! - * Initial buffuer level in ms for CBR / low delay encoding - */ - int64_t starting_buffer_level; - /*! - * Optimum / target buffuer level in ms for CBR / low delay encoding - */ - int64_t optimal_buffer_level; - /*! - * Maximum target buffuer level in ms for CBR / low delay encoding - */ - int64_t maximum_buffer_size; /*!\cond */ // rate control history for last frame(1) and the frame before(2). @@ -292,14 +241,8 @@ typedef struct { int q_1_frame; int q_2_frame; - float_t arf_boost_factor; - /*!\endcond */ /*! - * Q index used for ALT frame - */ - int arf_q; - /*! * Proposed maximum alloed Q for current frame */ int active_worst_quality; @@ -309,35 +252,119 @@ typedef struct { int active_best_quality[MAX_ARF_LAYERS + 1]; /*!\cond */ + // Track amount of low motion in scene + int avg_frame_low_motion; + + // For dynamic resize, 1 pass cbr. + RESIZE_STATE resize_state; + int resize_avg_qp; + int resize_buffer_underflow; + int resize_count; + /*!\endcond */ +} RATE_CONTROL; + +/*! + * \brief Primary Rate Control parameters and status + */ +typedef struct { + // Sub-gop level Rate targetting variables + + /*! + * Target bit budget for the current GF / ARF group of frame. + */ + int64_t gf_group_bits; + + /*! + * Boost factor used to calculate the extra bits allocated to the key frame + */ + int kf_boost; + + /*! + * Boost factor used to calculate the extra bits allocated to ARFs and GFs + */ + int gfu_boost; + + /*! + * Stores the determined gf group lengths for a set of gf groups + */ + int gf_intervals[MAX_NUM_GF_INTERVALS]; + + /*! 
+ * The current group's index into gf_intervals[] + */ + int cur_gf_index; + + /*!\cond */ + int num_regions; + + REGIONS regions[MAX_FIRSTPASS_ANALYSIS_FRAMES]; + int regions_offset; // offset of regions from the last keyframe + int frames_till_regions_update; + + int baseline_gf_interval; + + int constrained_gf_group; + + int this_key_frame_forced; + + int next_key_frame_forced; + /*!\endcond */ + + /*! + * Initial buffuer level in ms for CBR / low delay encoding + */ + int64_t starting_buffer_level; + + /*! + * Optimum / target buffuer level in ms for CBR / low delay encoding + */ + int64_t optimal_buffer_level; + + /*! + * Maximum target buffuer level in ms for CBR / low delay encoding + */ + int64_t maximum_buffer_size; + + /*! + * Q index used for ALT frame + */ + int arf_q; + + /*!\cond */ + float_t arf_boost_factor; + int base_layer_qp; // Total number of stats used only for kf_boost calculation. int num_stats_used_for_kf_boost; + // Total number of stats used only for gfu_boost calculation. int num_stats_used_for_gfu_boost; + // Total number of stats required by gfu_boost calculation. int num_stats_required_for_gfu_boost; + int next_is_fwd_key; + int enable_scenecut_detection; - int use_arf_in_this_kf_group; - // Track amount of low motion in scene - int avg_frame_low_motion; - // For dynamic resize, 1 pass cbr. - RESIZE_STATE resize_state; - int resize_avg_qp; - int resize_buffer_underflow; - int resize_count; + int use_arf_in_this_kf_group; /*!\endcond */ -} RATE_CONTROL; -/*!\cond */ + /*! 
+ * Q used for last boosted (non leaf) frame + */ + int last_kf_qindex; +} PRIMARY_RATE_CONTROL; struct AV1_COMP; struct AV1EncoderConfig; +void av1_primary_rc_init(const struct AV1EncoderConfig *oxcf, + PRIMARY_RATE_CONTROL *p_rc); + void av1_rc_init(const struct AV1EncoderConfig *oxcf, int pass, - RATE_CONTROL *rc); + RATE_CONTROL *rc, const PRIMARY_RATE_CONTROL *const p_rc); int av1_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs, double correction_factor, aom_bit_depth_t bit_depth, @@ -415,7 +442,6 @@ void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi, * * \ingroup rate_control * \param[in] cpi Top level encoder structure - * \param[in,out] rc Top level rate control structure * \param[in] width Coded frame width * \param[in] height Coded frame height * \param[in] gf_index Index of this frame in the golden frame group @@ -424,9 +450,8 @@ void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi, * \return Returns selected q index to be used for encoding this frame. * Also, updates \c rc->arf_q. 
*/ -int av1_rc_pick_q_and_bounds(const struct AV1_COMP *cpi, RATE_CONTROL *rc, - int width, int height, int gf_index, - int *bottom_index, int *top_index); +int av1_rc_pick_q_and_bounds(const struct AV1_COMP *cpi, int width, int height, + int gf_index, int *bottom_index, int *top_index); /*!\brief Estimates q to achieve a target bits per frame * diff --git a/third_party/libaom/source/libaom/av1/encoder/rc_utils.h b/third_party/libaom/source/libaom/av1/encoder/rc_utils.h index 98cec2e003..0a9d02d17b 100644 --- a/third_party/libaom/source/libaom/av1/encoder/rc_utils.h +++ b/third_party/libaom/source/libaom/av1/encoder/rc_utils.h @@ -19,18 +19,45 @@ extern "C" { #endif -static AOM_INLINE void set_rc_buffer_sizes(RATE_CONTROL *rc, - const RateControlCfg *rc_cfg) { +static AOM_INLINE void check_reset_rc_flag(AV1_COMP *cpi) { + RATE_CONTROL *rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + if (cpi->common.current_frame.frame_number > + (unsigned int)cpi->svc.number_spatial_layers) { + if (cpi->ppi->use_svc) { + av1_svc_check_reset_layer_rc_flag(cpi); + } else { + if (rc->avg_frame_bandwidth > (3 * rc->prev_avg_frame_bandwidth >> 1) || + rc->avg_frame_bandwidth < (rc->prev_avg_frame_bandwidth >> 1)) { + rc->rc_1_frame = 0; + rc->rc_2_frame = 0; + rc->bits_off_target = p_rc->optimal_buffer_level; + rc->buffer_level = p_rc->optimal_buffer_level; + } + } + } +} + +static AOM_INLINE void set_rc_buffer_sizes(AV1_COMP *cpi) { + RATE_CONTROL *rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; + const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg; + const int64_t bandwidth = rc_cfg->target_bandwidth; const int64_t starting = rc_cfg->starting_buffer_level_ms; const int64_t optimal = rc_cfg->optimal_buffer_level_ms; const int64_t maximum = rc_cfg->maximum_buffer_size_ms; - rc->starting_buffer_level = starting * bandwidth / 1000; - rc->optimal_buffer_level = + p_rc->starting_buffer_level = starting * bandwidth / 1000; + 
p_rc->optimal_buffer_level = (optimal == 0) ? bandwidth / 8 : optimal * bandwidth / 1000; - rc->maximum_buffer_size = + p_rc->maximum_buffer_size = (maximum == 0) ? bandwidth / 8 : maximum * bandwidth / 1000; + + // Under a configuration change, where maximum_buffer_size may change, + // keep buffer level clipped to the maximum allowed buffer size. + rc->bits_off_target = AOMMIN(rc->bits_off_target, p_rc->maximum_buffer_size); + rc->buffer_level = AOMMIN(rc->buffer_level, p_rc->maximum_buffer_size); } static AOM_INLINE void config_target_level(AV1_COMP *const cpi, @@ -38,7 +65,7 @@ static AOM_INLINE void config_target_level(AV1_COMP *const cpi, aom_clear_system_state(); AV1EncoderConfig *const oxcf = &cpi->oxcf; - SequenceHeader *const seq_params = &cpi->common.seq_params; + SequenceHeader *const seq_params = cpi->common.seq_params; TileConfig *const tile_cfg = &oxcf->tile_cfg; RateControlCfg *const rc_cfg = &oxcf->rc_cfg; @@ -48,11 +75,11 @@ static AOM_INLINE void config_target_level(AV1_COMP *const cpi, av1_get_max_bitrate_for_level(target_level, tier, profile); const int64_t max_bitrate = (int64_t)(level_bitrate_limit * 0.70); rc_cfg->target_bandwidth = AOMMIN(rc_cfg->target_bandwidth, max_bitrate); - // Also need to update cpi->twopass.bits_left. - TWO_PASS *const twopass = &cpi->twopass; + // Also need to update cpi->ppi->twopass.bits_left. + TWO_PASS *const twopass = &cpi->ppi->twopass; FIRSTPASS_STATS *stats = twopass->stats_buf_ctx->total_stats; if (stats != NULL) - cpi->twopass.bits_left = + cpi->ppi->twopass.bits_left = (int64_t)(stats->duration * rc_cfg->target_bandwidth / 10000000.0); // Adjust max over-shoot percentage. 
@@ -226,6 +253,7 @@ static AOM_INLINE void recode_loop_update_q( int *const low_cr_seen, const int loop_count) { AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; + PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg; *loop = 0; @@ -263,14 +291,15 @@ static AOM_INLINE void recode_loop_update_q( &frame_over_shoot_limit); if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; - if (cm->current_frame.frame_type == KEY_FRAME && rc->this_key_frame_forced && + if (cm->current_frame.frame_type == KEY_FRAME && + p_rc->this_key_frame_forced && rc->projected_frame_size < rc->max_frame_bandwidth) { int64_t kf_err; const int64_t high_err_target = cpi->ambient_err; const int64_t low_err_target = cpi->ambient_err >> 1; #if CONFIG_AV1_HIGHBITDEPTH - if (cm->seq_params.use_highbitdepth) { + if (cm->seq_params->use_highbitdepth) { kf_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf); } else { kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf); @@ -323,11 +352,11 @@ static AOM_INLINE void recode_loop_update_q( if (*q == *q_high && rc->projected_frame_size >= rc->max_frame_bandwidth) { const double q_val_high_current = - av1_convert_qindex_to_q(*q_high, cm->seq_params.bit_depth); + av1_convert_qindex_to_q(*q_high, cm->seq_params->bit_depth); const double q_val_high_new = q_val_high_current * ((double)rc->projected_frame_size / rc->max_frame_bandwidth); - *q_high = av1_find_qindex(q_val_high_new, cm->seq_params.bit_depth, + *q_high = av1_find_qindex(q_val_high_new, cm->seq_params->bit_depth, rc->best_quality, rc->worst_quality); } diff --git a/third_party/libaom/source/libaom/av1/encoder/rd.c b/third_party/libaom/source/libaom/av1/encoder/rd.c index 389b4bfe3b..e361264f16 100644 --- a/third_party/libaom/source/libaom/av1/encoder/rd.c +++ b/third_party/libaom/source/libaom/av1/encoder/rd.c @@ -354,11 +354,45 @@ static const int rd_layer_depth_factor[7] = { 160, 160, 160, 160, 192, 208, 
224 }; +// Returns the default rd multiplier for inter frames for a given qindex. +// The function here is a first pass estimate based on data from +// a previous Vizer run +static double def_inter_rd_multiplier(int qindex) { + return 3.2 + (0.0035 * (double)qindex); +} + +// Returns the default rd multiplier for ARF/Golden Frames for a given qindex. +// The function here is a first pass estimate based on data from +// a previous Vizer run +static double def_arf_rd_multiplier(int qindex) { + return 3.25 + (0.0035 * (double)qindex); +} + +// Returns the default rd multiplier for key frames for a given qindex. +// The function here is a first pass estimate based on data from +// a previous Vizer run +static double def_kf_rd_multiplier(int qindex) { + return 3.3 + (0.0035 * (double)qindex); +} + int av1_compute_rd_mult_based_on_qindex(const AV1_COMP *cpi, int qindex) { - const int q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth); - int rdmult = (int)(((int64_t)88 * q * q) / 24); + const int q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params->bit_depth); + const FRAME_UPDATE_TYPE update_type = + cpi->ppi->gf_group.update_type[cpi->gf_frame_index]; + int rdmult = q * q; + + if (update_type == KF_UPDATE) { + double def_rd_q_mult = def_kf_rd_multiplier(qindex); + rdmult = (int)((double)rdmult * def_rd_q_mult); + } else if ((update_type == GF_UPDATE) || (update_type == ARF_UPDATE)) { + double def_rd_q_mult = def_arf_rd_multiplier(qindex); + rdmult = (int)((double)rdmult * def_rd_q_mult); + } else { + double def_rd_q_mult = def_inter_rd_multiplier(qindex); + rdmult = (int)((double)rdmult * def_rd_q_mult); + } - switch (cpi->common.seq_params.bit_depth) { + switch (cpi->common.seq_params->bit_depth) { case AOM_BITS_8: break; case AOM_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break; case AOM_BITS_12: rdmult = ROUND_POWER_OF_TWO(rdmult, 8); break; @@ -373,9 +407,10 @@ int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) { int64_t rdmult = 
av1_compute_rd_mult_based_on_qindex(cpi, qindex); if (is_stat_consumption_stage(cpi) && (cpi->common.current_frame.frame_type != KEY_FRAME)) { - const GF_GROUP *const gf_group = &cpi->gf_group; - const int boost_index = AOMMIN(15, (cpi->rc.gfu_boost / 100)); - const int layer_depth = AOMMIN(gf_group->layer_depth[gf_group->index], 6); + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100)); + const int layer_depth = + AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6); // Layer depth adjustment rdmult = (rdmult * rd_layer_depth_factor[layer_depth]) >> 7; @@ -386,21 +421,30 @@ int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) { return (int)rdmult; } -int av1_get_deltaq_offset(const AV1_COMP *cpi, int qindex, double beta) { +int av1_get_deltaq_offset(aom_bit_depth_t bit_depth, int qindex, double beta) { assert(beta > 0.0); - int q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth); + int q = av1_dc_quant_QTX(qindex, 0, bit_depth); int newq = (int)rint(q / sqrt(beta)); int orig_qindex = qindex; + if (newq == q) { + return 0; + } if (newq < q) { - do { + while (qindex > 0) { qindex--; - q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth); - } while (newq < q && qindex > 0); + q = av1_dc_quant_QTX(qindex, 0, bit_depth); + if (newq >= q) { + break; + } + } } else { - do { + while (qindex < MAXQ) { qindex++; - q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth); - } while (newq > q && qindex < MAXQ); + q = av1_dc_quant_QTX(qindex, 0, bit_depth); + if (newq <= q) { + break; + } + } } return qindex - orig_qindex; } @@ -409,7 +453,7 @@ int av1_get_adaptive_rdmult(const AV1_COMP *cpi, double beta) { assert(beta > 0.0); const AV1_COMMON *cm = &cpi->common; int q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0, - cm->seq_params.bit_depth); + cm->seq_params->bit_depth); return (int)(av1_compute_rd_mult(cpi, q) / beta); } @@ -433,7 +477,7 @@ static int 
compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) { } void av1_set_sad_per_bit(const AV1_COMP *cpi, int *sadperbit, int qindex) { - switch (cpi->common.seq_params.bit_depth) { + switch (cpi->common.seq_params->bit_depth) { case AOM_BITS_8: *sadperbit = sad_per_bit_lut_8[qindex]; break; case AOM_BITS_10: *sadperbit = sad_per_bit_lut_10[qindex]; break; case AOM_BITS_12: *sadperbit = sad_per_bit_lut_12[qindex]; break; @@ -450,7 +494,7 @@ static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd) { av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex) + cm->quant_params.y_dc_delta_q, 0, MAXQ); - const int q = compute_rd_thresh_factor(qindex, cm->seq_params.bit_depth); + const int q = compute_rd_thresh_factor(qindex, cm->seq_params->bit_depth); for (bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { // Threshold here seems unnecessarily harsh but fine given actual @@ -577,6 +621,13 @@ void av1_fill_mv_costs(const nmv_context *nmvc, int integer_mv, int usehp, } } +void av1_fill_dv_costs(const nmv_context *ndvc, IntraBCMVCosts *dv_costs) { + dv_costs->dv_costs[0] = &dv_costs->dv_costs_alloc[0][MV_MAX]; + dv_costs->dv_costs[1] = &dv_costs->dv_costs_alloc[1][MV_MAX]; + av1_build_nmv_cost_table(dv_costs->joint_mv, dv_costs->dv_costs, ndvc, + MV_SUBPEL_NONE); +} + void av1_initialize_rd_consts(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->td.mb; @@ -610,14 +661,9 @@ void av1_initialize_rd_consts(AV1_COMP *cpi) { cost_upd_freq.mode == COST_UPD_TILE || fill_costs) av1_fill_mode_rates(cm, &x->mode_costs, cm->fc); - if (!use_nonrd_pick_mode && frame_is_intra_only(cm) && - cm->features.allow_screen_content_tools && + if (!use_nonrd_pick_mode && av1_allow_intrabc(cm) && !is_stat_generation_stage(cpi)) { - IntraBCMVCosts *const dv_costs = &cpi->dv_costs; - int *dvcost[2] = { &dv_costs->mv_component[0][MV_MAX], - &dv_costs->mv_component[1][MV_MAX] }; - av1_build_nmv_cost_table(dv_costs->joint_mv, dvcost, 
&cm->fc->ndvc, - MV_SUBPEL_NONE); + av1_fill_dv_costs(&cm->fc->ndvc, x->dv_costs); } } @@ -1016,12 +1062,16 @@ void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, const uint8_t *const ref_y_ptr = &ref_y_buffer[ref_y_stride * fp_row + fp_col]; // Find sad for current vector. - const int this_sad = cpi->fn_ptr[block_size].sdf( + const int this_sad = cpi->ppi->fn_ptr[block_size].sdf( src_y_ptr, x->plane[0].src.stride, ref_y_ptr, ref_y_stride); // Note if it is the best so far. if (this_sad < best_sad) { best_sad = this_sad; } + if (i == 0) + x->pred_mv0_sad[ref_frame] = this_sad; + else if (i == 1) + x->pred_mv1_sad[ref_frame] = this_sad; } // Note the index of the mv that worked best in the reference list. @@ -1287,7 +1337,7 @@ void av1_update_rd_thresh_fact(const AV1_COMMON *const cm, const THR_MODES top_mode = MAX_MODES; const int max_rd_thresh_factor = use_adaptive_rd_thresh * RD_THRESH_MAX_FACT; - const int bsize_is_1_to_4 = bsize > cm->seq_params.sb_size; + const int bsize_is_1_to_4 = bsize > cm->seq_params->sb_size; BLOCK_SIZE min_size, max_size; if (bsize_is_1_to_4) { // This part handles block sizes with 1:4 and 4:1 aspect ratios @@ -1296,7 +1346,7 @@ void av1_update_rd_thresh_fact(const AV1_COMMON *const cm, max_size = bsize; } else { min_size = AOMMAX(bsize - 2, BLOCK_4X4); - max_size = AOMMIN(bsize + 2, (int)cm->seq_params.sb_size); + max_size = AOMMIN(bsize + 2, (int)cm->seq_params->sb_size); } for (THR_MODES mode = 0; mode < top_mode; ++mode) { diff --git a/third_party/libaom/source/libaom/av1/encoder/rd.h b/third_party/libaom/source/libaom/av1/encoder/rd.h index e37c86b9d5..c1ba819ae2 100644 --- a/third_party/libaom/source/libaom/av1/encoder/rd.h +++ b/third_party/libaom/source/libaom/av1/encoder/rd.h @@ -81,20 +81,6 @@ typedef struct RD_OPT { double r0; } RD_OPT; -typedef struct { - // Cost of transmitting the actual motion vector. 
- // mv_component[0][i] is the cost of motion vector with horizontal component - // (mv_row) equal to i - MV_MAX. - // mv_component[1][i] is the cost of motion vector with vertical component - // (mv_col) equal to i - MV_MAX. - int mv_component[2][MV_VALS]; - - // joint_mv[i] is the cost of transmitting joint mv(MV_JOINT_TYPE) of - // type i. - // TODO(huisu@google.com): we can update dv_joint_cost per SB. - int joint_mv[MV_JOINTS]; -} IntraBCMVCosts; - static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) { #if CONFIG_RD_DEBUG int plane; @@ -110,12 +96,6 @@ static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) { // encoded, as there will only be 1 plane for (plane = 0; plane < MAX_MB_PLANE; ++plane) { rd_stats->txb_coeff_cost[plane] = 0; - { - int r, c; - for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) - for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) - rd_stats->txb_coeff_cost_map[plane][r][c] = 0; - } } #endif } @@ -135,19 +115,18 @@ static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) { // encoded, as there will only be 1 plane for (plane = 0; plane < MAX_MB_PLANE; ++plane) { rd_stats->txb_coeff_cost[plane] = INT_MAX; - { - int r, c; - for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) - for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) - rd_stats->txb_coeff_cost_map[plane][r][c] = INT16_MAX; - } } #endif } static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst, const RD_STATS *rd_stats_src) { - assert(rd_stats_dst->rate != INT_MAX && rd_stats_src->rate != INT_MAX); + if (rd_stats_dst->rate == INT_MAX || rd_stats_src->rate == INT_MAX) { + // If rd_stats_dst or rd_stats_src has invalid rate, we will make + // rd_stats_dst invalid. 
+ av1_invalid_rd_stats(rd_stats_dst); + return; + } rd_stats_dst->rate = (int)AOMMIN( ((int64_t)rd_stats_dst->rate + (int64_t)rd_stats_src->rate), INT_MAX); if (!rd_stats_dst->zero_rate) @@ -160,18 +139,6 @@ static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst, // encoded, as there will only be 1 plane for (int plane = 0; plane < MAX_MB_PLANE; ++plane) { rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane]; - { - // TODO(angiebird): optimize this part - int r, c; - int ref_txb_coeff_cost = 0; - for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) - for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) { - rd_stats_dst->txb_coeff_cost_map[plane][r][c] += - rd_stats_src->txb_coeff_cost_map[plane][r][c]; - ref_txb_coeff_cost += rd_stats_dst->txb_coeff_cost_map[plane][r][c]; - } - assert(ref_txb_coeff_cost == rd_stats_dst->txb_coeff_cost[plane]); - } } #endif } @@ -375,9 +342,11 @@ void av1_fill_coeff_costs(CoeffCosts *coeff_costs, FRAME_CONTEXT *fc, void av1_fill_mv_costs(const nmv_context *nmvc, int integer_mv, int usehp, MvCosts *mv_costs); +void av1_fill_dv_costs(const nmv_context *ndvc, IntraBCMVCosts *dv_costs); + int av1_get_adaptive_rdmult(const struct AV1_COMP *cpi, double beta); -int av1_get_deltaq_offset(const struct AV1_COMP *cpi, int qindex, double beta); +int av1_get_deltaq_offset(aom_bit_depth_t bit_depth, int qindex, double beta); #ifdef __cplusplus } // extern "C" diff --git a/third_party/libaom/source/libaom/av1/encoder/rdopt.c b/third_party/libaom/source/libaom/av1/encoder/rdopt.c index 6200ac11dd..3ca0cb4143 100644 --- a/third_party/libaom/source/libaom/av1/encoder/rdopt.c +++ b/third_party/libaom/source/libaom/av1/encoder/rdopt.c @@ -627,8 +627,8 @@ static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x, get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y); unsigned int sse; - cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, - &sse); + cpi->ppi->fn_ptr[bs].vf(p->src.buf, 
p->src.stride, pd->dst.buf, + pd->dst.stride, &sse); total_sse += sse; if (!plane && sse_y) *sse_y = sse; } @@ -1156,13 +1156,16 @@ static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, int_mv best_mv; av1_single_motion_search(cpi, x, bsize, ref_idx, rate_mv, search_range, - mode_info, &best_mv); + mode_info, &best_mv, args); if (best_mv.as_int == INVALID_MV) return INT64_MAX; args->single_newmv[ref_mv_idx][refs[0]] = best_mv; args->single_newmv_rate[ref_mv_idx][refs[0]] = *rate_mv; args->single_newmv_valid[ref_mv_idx][refs[0]] = 1; cur_mv[0].as_int = best_mv.as_int; + + // Return after single_newmv is set. + if (mode_info[mbmi->ref_mv_idx].skip) return INT64_MAX; } return 0; @@ -1276,7 +1279,7 @@ static int64_t motion_mode_rd( uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; const int rate_mv0 = *rate_mv; - const int interintra_allowed = cm->seq_params.enable_interintra_compound && + const int interintra_allowed = cm->seq_params->enable_interintra_compound && is_interintra_allowed(mbmi) && mbmi->compound_idx; WARP_SAMPLE_INFO *const warp_sample_info = @@ -1319,7 +1322,7 @@ static int64_t motion_mode_rd( const int switchable_rate = av1_is_interp_needed(xd) ? av1_get_switchable_rate(x, xd, interp_filter, - cm->seq_params.enable_dual_filter) + cm->seq_params->enable_dual_filter) : 0; int64_t best_rd = INT64_MAX; int best_rate_mv = rate_mv0; @@ -1355,11 +1358,18 @@ static int64_t motion_mode_rd( // Do not search OBMC if the probability of selecting it is below a // predetermined threshold for this update_type and block size. 
- const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); - const int prune_obmc = cpi->frame_probs.obmc_probs[update_type][bsize] < - cpi->sf.inter_sf.prune_obmc_prob_thresh; - if ((!cpi->oxcf.motion_mode_cfg.enable_obmc || - cpi->sf.rt_sf.use_nonrd_pick_mode || prune_obmc) && + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + int obmc_probability; +#if CONFIG_FRAME_PARALLEL_ENCODE + obmc_probability = + cpi->ppi->temp_frame_probs.obmc_probs[update_type][bsize]; +#else + obmc_probability = cpi->frame_probs.obmc_probs[update_type][bsize]; +#endif + const int prune_obmc = + obmc_probability < cpi->sf.inter_sf.prune_obmc_prob_thresh; + if ((!cpi->oxcf.motion_mode_cfg.enable_obmc || prune_obmc) && mbmi->motion_mode == OBMC_CAUSAL) continue; @@ -1373,7 +1383,7 @@ static int64_t motion_mode_rd( assert(!is_comp_pred); if (have_newmv_in_inter_mode(this_mode)) { av1_single_motion_search(cpi, x, bsize, 0, &tmp_rate_mv, INT_MAX, NULL, - &mbmi->mv[0]); + &mbmi->mv[0], NULL); tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv; } if ((mbmi->mv[0].as_int != cur_mv) || eval_motion_mode) { @@ -1897,10 +1907,11 @@ static bool ref_mv_idx_early_breakout( } // Compute the estimated RD cost for the motion vector with simple translation. 
-static int64_t simple_translation_pred_rd( - AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, - HandleInterModeArgs *args, int ref_mv_idx, inter_mode_info *mode_info, - int64_t ref_best_rd, BLOCK_SIZE bsize) { +static int64_t simple_translation_pred_rd(AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, + HandleInterModeArgs *args, + int ref_mv_idx, int64_t ref_best_rd, + BLOCK_SIZE bsize) { MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; @@ -1933,7 +1944,6 @@ static int64_t simple_translation_pred_rd( const int drl_cost = get_drl_cost(mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type); rd_stats->rate += drl_cost; - mode_info[ref_mv_idx].drl_cost = drl_cost; int_mv cur_mv[2]; if (!build_cur_mv(cur_mv, mbmi->mode, cm, x, 0)) { @@ -1987,8 +1997,8 @@ static INLINE bool mask_check_bit(int mask, int index) { static int ref_mv_idx_to_search(AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, HandleInterModeArgs *const args, - int64_t ref_best_rd, inter_mode_info *mode_info, - BLOCK_SIZE bsize, const int ref_set) { + int64_t ref_best_rd, BLOCK_SIZE bsize, + const int ref_set) { AV1_COMMON *const cm = &cpi->common; const MACROBLOCKD *const xd = &x->e_mbd; const MB_MODE_INFO *const mbmi = xd->mi[0]; @@ -2027,7 +2037,7 @@ static int ref_mv_idx_to_search(AV1_COMP *const cpi, MACROBLOCK *x, continue; } idx_rdcost[ref_mv_idx] = simple_translation_pred_rd( - cpi, x, rd_stats, args, ref_mv_idx, mode_info, ref_best_rd, bsize); + cpi, x, rd_stats, args, ref_mv_idx, ref_best_rd, bsize); } // Find the index with the best RD cost. 
int best_idx = 0; @@ -2171,14 +2181,17 @@ typedef struct { static AOM_INLINE void get_block_level_tpl_stats( AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, int *valid_refs, PruneInfoFromTpl *inter_cost_info_from_tpl) { - const GF_GROUP *const gf_group = &cpi->gf_group; AV1_COMMON *const cm = &cpi->common; - assert(IMPLIES(gf_group->size > 0, gf_group->index < gf_group->size)); - const int tpl_idx = gf_group->index; - TplParams *const tpl_data = &cpi->tpl_data; + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int tpl_idx = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; + if (tpl_idx >= MAX_TPL_FRAME_IDX) { + return; + } const TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; - if (tpl_idx >= MAX_TPL_FRAME_IDX || !tpl_frame->is_valid) { + if (!tpl_frame->is_valid) { return; } @@ -2274,101 +2287,6 @@ static AOM_INLINE int prune_modes_based_on_tpl_stats( return 0; } -// If the current mode being searched is NEWMV, this function will look -// at previously searched MVs and check if they are the same -// as the current MV. If it finds that this MV is repeated, it compares -// the cost to the previous MV and skips the rest of the search if it is -// more expensive. 
-static int skip_repeated_newmv( - AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - const int do_tx_search, const PREDICTION_MODE this_mode, - MB_MODE_INFO *best_mbmi, motion_mode_candidate *motion_mode_cand, - int64_t *ref_best_rd, RD_STATS *best_rd_stats, RD_STATS *best_rd_stats_y, - RD_STATS *best_rd_stats_uv, inter_mode_info *mode_info, - HandleInterModeArgs *args, int drl_cost, const int *refs, int_mv *cur_mv, - int64_t *best_rd, const BUFFER_SET orig_dst, int ref_mv_idx) { - // This feature only works for NEWMV when a previous mv has been searched - if (this_mode != NEWMV || ref_mv_idx == 0) return 0; - MACROBLOCKD *xd = &x->e_mbd; - const AV1_COMMON *cm = &cpi->common; - const int num_planes = av1_num_planes(cm); - - int skip = 0; - int this_rate_mv = 0; - int i; - for (i = 0; i < ref_mv_idx; ++i) { - // Check if the motion search result same as previous results - if (cur_mv[0].as_int == args->single_newmv[i][refs[0]].as_int && - args->single_newmv_valid[i][refs[0]]) { - // If the compared mode has no valid rd, it is unlikely this - // mode will be the best mode - if (mode_info[i].rd == INT64_MAX) { - skip = 1; - break; - } - // Compare the cost difference including drl cost and mv cost - if (mode_info[i].mv.as_int != INVALID_MV) { - const int compare_cost = mode_info[i].rate_mv + mode_info[i].drl_cost; - const int_mv ref_mv = av1_get_ref_mv(x, 0); - this_rate_mv = av1_mv_bit_cost( - &mode_info[i].mv.as_mv, &ref_mv.as_mv, x->mv_costs->nmv_joint_cost, - x->mv_costs->mv_cost_stack, MV_COST_WEIGHT); - const int this_cost = this_rate_mv + drl_cost; - - if (compare_cost <= this_cost) { - // Skip this mode if it is more expensive as the previous result - // for this MV - skip = 1; - break; - } else { - // If the cost is less than current best result, make this - // the best and update corresponding variables unless the - // best_mv is the same as ref_mv. 
In this case we skip and - // rely on NEAR(EST)MV instead - if (best_mbmi->ref_mv_idx == i && - best_mbmi->mv[0].as_int != ref_mv.as_int) { - assert(*best_rd != INT64_MAX); - assert(best_mbmi->mv[0].as_int == mode_info[i].mv.as_int); - best_mbmi->ref_mv_idx = ref_mv_idx; - motion_mode_cand->rate_mv = this_rate_mv; - best_rd_stats->rate += this_cost - compare_cost; - *best_rd = - RDCOST(x->rdmult, best_rd_stats->rate, best_rd_stats->dist); - // We also need to update mode_info here because we are setting - // (ref_)best_rd here. So we will not be able to search the same - // mode again with the current configuration. - mode_info[ref_mv_idx].mv.as_int = best_mbmi->mv[0].as_int; - mode_info[ref_mv_idx].rate_mv = this_rate_mv; - mode_info[ref_mv_idx].rd = *best_rd; - if (*best_rd < *ref_best_rd) *ref_best_rd = *best_rd; - break; - } - } - } - } - } - if (skip) { - const THR_MODES mode_enum = get_prediction_mode_idx( - best_mbmi->mode, best_mbmi->ref_frame[0], best_mbmi->ref_frame[1]); - // Collect mode stats for multiwinner mode processing - store_winner_mode_stats( - &cpi->common, x, best_mbmi, best_rd_stats, best_rd_stats_y, - best_rd_stats_uv, mode_enum, NULL, bsize, *best_rd, - cpi->sf.winner_mode_sf.multi_winner_mode_type, do_tx_search); - args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = - args->modelled_rd[this_mode][i][refs[0]]; - args->simple_rd[this_mode][ref_mv_idx][refs[0]] = - args->simple_rd[this_mode][i][refs[0]]; - mode_info[ref_mv_idx].rd = mode_info[i].rd; - mode_info[ref_mv_idx].rate_mv = this_rate_mv; - mode_info[ref_mv_idx].mv.as_int = mode_info[i].mv.as_int; - - restore_dst_buf(xd, orig_dst, num_planes); - return 1; - } - return 0; -} - /*!\brief High level function to select parameters for compound mode. 
* * \ingroup inter_mode_search @@ -2427,7 +2345,7 @@ static int process_compound_inter_mode( MB_MODE_INFO *mbmi = xd->mi[0]; const AV1_COMMON *cm = &cpi->common; const int masked_compound_used = is_any_masked_compound_used(bsize) && - cm->seq_params.enable_masked_compound; + cm->seq_params->enable_masked_compound; int mode_search_mask = (1 << COMPOUND_AVERAGE) | (1 << COMPOUND_DISTWTD) | (1 << COMPOUND_WEDGE) | (1 << COMPOUND_DIFFWTD); @@ -2506,6 +2424,76 @@ static int prune_ref_mv_idx_search(int ref_mv_idx, int best_ref_mv_idx, return 0; } +/*!\brief Prunes ZeroMV Search Using Best NEWMV's SSE + * + * \ingroup inter_mode_search + * + * Compares the sse of zero mv and the best sse found in single new_mv. If the + * sse of the zero_mv is higher, returns 1 to signal zero_mv can be skipped. + * Else returns 0. + * + * Note that the sse of here comes from single_motion_search. So it is + * interpolated with the filter in motion search, not the actual interpolation + * filter used in encoding. + * + * \param[in] fn_ptr A table of function pointers to compute SSE. + * \param[in] x Pointer to struct holding all the data for + * the current macroblock. + * \param[in] bsize The current block_size. + * \param[in] args The args to handle_inter_mode, used to track + * the best SSE. + * \return Returns 1 if zero_mv is pruned, 0 otherwise. 
+ */ +static AOM_INLINE int prune_zero_mv_with_sse( + const aom_variance_fn_ptr_t *fn_ptr, const MACROBLOCK *x, BLOCK_SIZE bsize, + const HandleInterModeArgs *args) { + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + + const int is_comp_pred = has_second_ref(mbmi); + const MV_REFERENCE_FRAME *refs = mbmi->ref_frame; + + // Check that the global mv is the same as ZEROMV + assert(mbmi->mv[0].as_int == 0); + assert(IMPLIES(is_comp_pred, mbmi->mv[0].as_int == 0)); + assert(xd->global_motion[refs[0]].wmtype == TRANSLATION || + xd->global_motion[refs[0]].wmtype == IDENTITY); + + // Don't prune if we have invalid data + for (int idx = 0; idx < 1 + is_comp_pred; idx++) { + assert(mbmi->mv[0].as_int == 0); + if (args->best_single_sse_in_refs[refs[idx]] == INT32_MAX) { + return 0; + } + } + + // Sum up the sse of ZEROMV and best NEWMV + unsigned int this_sse_sum = 0; + unsigned int best_sse_sum = 0; + for (int idx = 0; idx < 1 + is_comp_pred; idx++) { + const struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; + const struct macroblockd_plane *pd = xd->plane; + const struct buf_2d *src_buf = &p->src; + const struct buf_2d *ref_buf = &pd->pre[idx]; + const uint8_t *src = src_buf->buf; + const uint8_t *ref = ref_buf->buf; + const int src_stride = src_buf->stride; + const int ref_stride = ref_buf->stride; + + unsigned int this_sse; + fn_ptr[bsize].vf(ref, ref_stride, src, src_stride, &this_sse); + this_sse_sum += this_sse; + + const unsigned int best_sse = args->best_single_sse_in_refs[refs[idx]]; + best_sse_sum += best_sse; + } + if (this_sse_sum > best_sse_sum) { + return 1; + } + + return 0; +} + /*!\brief AV1 inter mode RD computation * * \ingroup inter_mode_search @@ -2589,12 +2577,11 @@ static int64_t handle_inter_mode( const int is_comp_pred = has_second_ref(mbmi); const PREDICTION_MODE this_mode = mbmi->mode; - const GF_GROUP *const gf_group = &cpi->gf_group; - const int tpl_idx = gf_group->index; - TplDepFrame *tpl_frame = 
&cpi->tpl_data.tpl_frame[tpl_idx]; + const int tpl_idx = cpi->gf_frame_index; + TplParams *const tpl_data = &cpi->ppi->tpl_data; const int prune_modes_based_on_tpl = cpi->sf.inter_sf.prune_inter_modes_based_on_tpl && - tpl_idx < MAX_TPL_FRAME_IDX && tpl_frame->is_valid; + tpl_idx < MAX_TPL_FRAME_IDX && tpl_data->tpl_frame[tpl_idx].is_valid; int i; // Reference frames for this mode const int refs[2] = { mbmi->ref_frame[0], @@ -2606,10 +2593,10 @@ static int64_t handle_inter_mode( // of these currently holds the best predictor, and use the other // one for future predictions. In the end, copy from tmp_buf to // dst if necessary. - struct macroblockd_plane *p = xd->plane; + struct macroblockd_plane *pd = xd->plane; const BUFFER_SET orig_dst = { - { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf }, - { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride }, + { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf }, + { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride }, }; const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE, tmp_buf + 2 * MAX_SB_SQUARE }, @@ -2645,8 +2632,8 @@ static int64_t handle_inter_mode( // Save MV results from first 2 ref_mv_idx. int_mv save_mv[MAX_REF_MV_SEARCH - 1][2]; int best_ref_mv_idx = -1; - const int idx_mask = ref_mv_idx_to_search(cpi, x, rd_stats, args, ref_best_rd, - mode_info, bsize, ref_set); + const int idx_mask = + ref_mv_idx_to_search(cpi, x, rd_stats, args, ref_best_rd, bsize, ref_set); const int16_t mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); const ModeCosts *mode_costs = &x->mode_costs; @@ -2669,9 +2656,14 @@ static int64_t handle_inter_mode( // WARPED_CAUSAL) // 6.) 
Update stats if best so far for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) { + mbmi->ref_mv_idx = ref_mv_idx; + mode_info[ref_mv_idx].full_search_mv.as_int = INVALID_MV; - mode_info[ref_mv_idx].mv.as_int = INVALID_MV; - mode_info[ref_mv_idx].rd = INT64_MAX; + mode_info[ref_mv_idx].full_mv_bestsme = INT_MAX; + const int drl_cost = get_drl_cost( + mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type); + mode_info[ref_mv_idx].drl_cost = drl_cost; + mode_info[ref_mv_idx].skip = 0; if (!mask_check_bit(idx_mask, ref_mv_idx)) { // MV did not perform well in simple translation search. Skip it. @@ -2695,14 +2687,10 @@ static int64_t handle_inter_mode( mbmi->num_proj_ref = 0; mbmi->motion_mode = SIMPLE_TRANSLATION; - mbmi->ref_mv_idx = ref_mv_idx; // Compute cost for signalling this DRL index rd_stats->rate = base_rate; - const int drl_cost = get_drl_cost( - mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type); rd_stats->rate += drl_cost; - mode_info[ref_mv_idx].drl_cost = drl_cost; int rs = 0; int compmode_interinter_cost = 0; @@ -2731,17 +2719,16 @@ static int64_t handle_inter_mode( if (newmv_ret_val != 0) continue; - rd_stats->rate += rate_mv; + if (is_inter_singleref_mode(this_mode) && + cur_mv[0].as_int != INVALID_MV) { + const MV_REFERENCE_FRAME ref = refs[0]; + const unsigned int this_sse = x->pred_sse[ref]; + if (this_sse < args->best_single_sse_in_refs[ref]) { + args->best_single_sse_in_refs[ref] = this_sse; + } + } - // skip NEWMV mode in drl if the motion search result is the same - // as a previous result - if (cpi->sf.inter_sf.skip_repeated_newmv && - skip_repeated_newmv(cpi, x, bsize, do_tx_search, this_mode, - &best_mbmi, motion_mode_cand, &ref_best_rd, - &best_rd_stats, &best_rd_stats_y, - &best_rd_stats_uv, mode_info, args, drl_cost, - refs, cur_mv, &best_rd, orig_dst, ref_mv_idx)) - continue; + rd_stats->rate += rate_mv; } // Copy the motion vector for this mode into mbmi struct for (i = 0; i < is_comp_pred + 1; ++i) { @@ -2760,6 
+2747,14 @@ static int64_t handle_inter_mode( cpi->sf.inter_sf.prune_ref_mv_idx_search)) continue; + if (cpi->sf.gm_sf.prune_zero_mv_with_sse && + cpi->sf.gm_sf.gm_search_type == GM_DISABLE_SEARCH && + (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV)) { + if (prune_zero_mv_with_sse(cpi->ppi->fn_ptr, x, bsize, args)) { + continue; + } + } + #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, compound_type_rd_time); #endif @@ -2843,12 +2838,6 @@ static int64_t handle_inter_mode( if (ret_val != INT64_MAX) { int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - if (tmp_rd < mode_info[ref_mv_idx].rd) { - // Only update mode_info if the new result is actually better. - mode_info[ref_mv_idx].mv.as_int = mbmi->mv[0].as_int; - mode_info[ref_mv_idx].rate_mv = rate_mv; - mode_info[ref_mv_idx].rd = tmp_rd; - } const THR_MODES mode_enum = get_prediction_mode_idx( mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); // Collect mode stats for multiwinner mode processing @@ -2928,11 +2917,11 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, const int mi_col = xd->mi_col; const int w = block_size_wide[bsize]; const int h = block_size_high[bsize]; - const int sb_row = mi_row >> cm->seq_params.mib_size_log2; - const int sb_col = mi_col >> cm->seq_params.mib_size_log2; + const int sb_row = mi_row >> cm->seq_params->mib_size_log2; + const int sb_col = mi_col >> cm->seq_params->mib_size_log2; MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext; - MV_REFERENCE_FRAME ref_frame = INTRA_FRAME; + const MV_REFERENCE_FRAME ref_frame = INTRA_FRAME; av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, mbmi_ext->mode_context); @@ -2952,7 +2941,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, int_mv dv_ref = nearestmv.as_int == 0 ? 
nearmv : nearestmv; if (dv_ref.as_int == 0) { - av1_find_ref_dv(&dv_ref, tile, cm->seq_params.mib_size, mi_row); + av1_find_ref_dv(&dv_ref, tile, cm->seq_params->mib_size, mi_row); } // Ref DV should not have sub-pel. assert((dv_ref.as_mv.col & 7) == 0); @@ -2983,7 +2972,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, av1_make_default_fullpel_ms_params(&fullms_params, cpi, x, bsize, &dv_ref.as_mv, lookahead_search_sites, /*fine_search_interval=*/0); - const IntraBCMVCosts *const dv_costs = &cpi->dv_costs; + const IntraBCMVCosts *const dv_costs = x->dv_costs; av1_set_ms_to_intra_mode(&fullms_params, dv_costs); for (enum IntrabcMotionDirection dir = IBC_MOTION_ABOVE; @@ -2997,19 +2986,19 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, fullms_params.mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE; fullms_params.mv_limits.row_max = - (sb_row * cm->seq_params.mib_size - mi_row) * MI_SIZE - h; + (sb_row * cm->seq_params->mib_size - mi_row) * MI_SIZE - h; break; case IBC_MOTION_LEFT: fullms_params.mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE; fullms_params.mv_limits.col_max = - (sb_col * cm->seq_params.mib_size - mi_col) * MI_SIZE - w; + (sb_col * cm->seq_params->mib_size - mi_col) * MI_SIZE - w; // TODO(aconverse@google.com): Minimize the overlap between above and // left areas. 
fullms_params.mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE; int bottom_coded_mi_edge = - AOMMIN((sb_row + 1) * cm->seq_params.mib_size, tile->mi_row_end); + AOMMIN((sb_row + 1) * cm->seq_params->mib_size, tile->mi_row_end); fullms_params.mv_limits.row_max = (bottom_coded_mi_edge - mi_row) * MI_SIZE - h; break; @@ -3047,7 +3036,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, get_fullmv_from_mv(&dv))) continue; if (!av1_is_dv_valid(dv, cm, xd, mi_row, mi_col, bsize, - cm->seq_params.mib_size_log2)) + cm->seq_params->mib_size_log2)) continue; // DV should not have sub-pel. @@ -3065,12 +3054,10 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, av1_num_planes(cm) - 1); - int *dvcost[2] = { (int *)&dv_costs->mv_component[0][MV_MAX], - (int *)&dv_costs->mv_component[1][MV_MAX] }; // TODO(aconverse@google.com): The full motion field defining discount // in MV_COST_WEIGHT is too large. Explore other values. 
const int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, dv_costs->joint_mv, - dvcost, MV_COST_WEIGHT_SUB); + dv_costs->dv_costs, MV_COST_WEIGHT_SUB); const int rate_mode = x->mode_costs.intrabc_cost[1]; RD_STATS rd_stats_yuv, rd_stats_y, rd_stats_uv; if (!av1_txfm_search(cpi, x, bsize, &rd_stats_yuv, &rd_stats_y, @@ -3186,7 +3173,6 @@ static AOM_INLINE void rd_pick_skip_mode( const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; - const TxfmSearchParams *txfm_params = &x->txfm_search_params; x->compound_idx = 1; // COMPOUND_AVERAGE RD_STATS skip_mode_rd_stats; @@ -3247,6 +3233,8 @@ static AOM_INLINE void rd_pick_skip_mode( mbmi->motion_mode = SIMPLE_TRANSLATION; mbmi->ref_mv_idx = 0; mbmi->skip_mode = mbmi->skip_txfm = 1; + mbmi->palette_mode_info.palette_size[0] = 0; + mbmi->palette_mode_info.palette_size[1] = 0; set_default_interp_filters(mbmi, cm->features.interp_filter); @@ -3283,45 +3271,12 @@ static AOM_INLINE void rd_pick_skip_mode( assert(mode_index != THR_INVALID); search_state->best_mbmode.skip_mode = 1; search_state->best_mbmode = *mbmi; - - search_state->best_mbmode.skip_mode = search_state->best_mbmode.skip_txfm = - 1; - search_state->best_mbmode.mode = NEAREST_NEARESTMV; - search_state->best_mbmode.ref_frame[0] = mbmi->ref_frame[0]; - search_state->best_mbmode.ref_frame[1] = mbmi->ref_frame[1]; - search_state->best_mbmode.mv[0].as_int = mbmi->mv[0].as_int; - search_state->best_mbmode.mv[1].as_int = mbmi->mv[1].as_int; - search_state->best_mbmode.ref_mv_idx = 0; - - // Set up tx_size related variables for skip-specific loop filtering. - search_state->best_mbmode.tx_size = - block_signals_txsize(bsize) - ? 
tx_size_from_tx_mode(bsize, txfm_params->tx_mode_search_type) - : max_txsize_rect_lookup[bsize]; memset(search_state->best_mbmode.inter_tx_size, search_state->best_mbmode.tx_size, sizeof(search_state->best_mbmode.inter_tx_size)); set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->width, xd->height, search_state->best_mbmode.skip_txfm && is_inter_block(mbmi), xd); - - // Set up color-related variables for skip mode. - search_state->best_mbmode.uv_mode = UV_DC_PRED; - search_state->best_mbmode.palette_mode_info.palette_size[0] = 0; - search_state->best_mbmode.palette_mode_info.palette_size[1] = 0; - - search_state->best_mbmode.comp_group_idx = 0; - search_state->best_mbmode.compound_idx = x->compound_idx; - search_state->best_mbmode.interinter_comp.type = COMPOUND_AVERAGE; - search_state->best_mbmode.motion_mode = SIMPLE_TRANSLATION; - - search_state->best_mbmode.interintra_mode = - (INTERINTRA_MODE)(II_DC_PRED - 1); - search_state->best_mbmode.filter_intra_mode_info.use_filter_intra = 0; - - set_default_interp_filters(&search_state->best_mbmode, - cm->features.interp_filter); - search_state->best_mode_index = mode_index; // Update rd_cost @@ -3798,7 +3753,7 @@ static AOM_INLINE void set_params_rd_pick_inter_mode( // compound ref. 
if (skip_ref_frame_mask & (1 << ref_frame) && !is_ref_frame_used_by_compound_ref(ref_frame, skip_ref_frame_mask) && - !is_ref_frame_used_in_cache(ref_frame, x->intermode_cache)) { + !is_ref_frame_used_in_cache(ref_frame, x->mb_mode_cache)) { continue; } assert(get_ref_frame_yv12_buf(cm, ref_frame) != NULL); @@ -3824,7 +3779,7 @@ static AOM_INLINE void set_params_rd_pick_inter_mode( } if (skip_ref_frame_mask & (1 << ref_frame) && - !is_ref_frame_used_in_cache(ref_frame, x->intermode_cache)) { + !is_ref_frame_used_in_cache(ref_frame, x->mb_mode_cache)) { continue; } // Ref mv list population is not required, when compound references are @@ -3841,9 +3796,16 @@ static AOM_INLINE void set_params_rd_pick_inter_mode( } av1_count_overlappable_neighbors(cm, xd); - const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); - const int prune_obmc = cpi->frame_probs.obmc_probs[update_type][bsize] < - cpi->sf.inter_sf.prune_obmc_prob_thresh; + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + int obmc_probability; +#if CONFIG_FRAME_PARALLEL_ENCODE + obmc_probability = cpi->ppi->temp_frame_probs.obmc_probs[update_type][bsize]; +#else + obmc_probability = cpi->frame_probs.obmc_probs[update_type][bsize]; +#endif + const int prune_obmc = + obmc_probability < cpi->sf.inter_sf.prune_obmc_prob_thresh; if (cpi->oxcf.motion_mode_cfg.enable_obmc && !prune_obmc) { if (check_num_overlappable_neighbors(mbmi) && is_motion_variation_allowed_bsize(bsize)) { @@ -3874,6 +3836,10 @@ static AOM_INLINE void set_params_rd_pick_inter_mode( set_mode_eval_params(cpi, x, MODE_EVAL); x->comp_rd_stats_idx = 0; + + for (int idx = 0; idx < REF_FRAMES; idx++) { + args->best_single_sse_in_refs[idx] = INT32_MAX; + } } static AOM_INLINE void init_inter_mode_search_state( @@ -4060,8 +4026,8 @@ static int inter_mode_search_order_independent_skip( } // Reuse the prediction mode in cache - if (x->use_intermode_cache) { - const MB_MODE_INFO 
*cached_mi = x->intermode_cache; + if (x->use_mb_mode_cache) { + const MB_MODE_INFO *cached_mi = x->mb_mode_cache; const PREDICTION_MODE cached_mode = cached_mi->mode; const MV_REFERENCE_FRAME *cached_frame = cached_mi->ref_frame; const int cached_mode_is_single = cached_frame[1] <= INTRA_FRAME; @@ -4156,12 +4122,12 @@ static int inter_mode_search_order_independent_skip( } // If we are reusing the prediction from cache, and the current frame is // required by the cache, then we cannot prune it. - if (is_ref_frame_used_in_cache(ref_type, x->intermode_cache)) { + if (is_ref_frame_used_in_cache(ref_type, x->mb_mode_cache)) { skip_ref = 0; // If the cache only needs the current reference type for compound // prediction, then we can skip motion mode search. skip_motion_mode = (ref_type <= ALTREF_FRAME && - x->intermode_cache->ref_frame[1] > INTRA_FRAME); + x->mb_mode_cache->ref_frame[1] > INTRA_FRAME); } if (skip_ref) return 1; } @@ -4452,12 +4418,14 @@ static INLINE void match_ref_frame(const MB_MODE_INFO *const mbmi, // Prune compound mode using ref frames of neighbor blocks. static INLINE int compound_skip_using_neighbor_refs( MACROBLOCKD *const xd, const PREDICTION_MODE this_mode, - const MV_REFERENCE_FRAME *ref_frames, int prune_compound_using_neighbors) { + const MV_REFERENCE_FRAME *ref_frames, int prune_ext_comp_using_neighbors) { // Exclude non-extended compound modes from pruning if (this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV || this_mode == NEW_NEWMV || this_mode == GLOBAL_GLOBALMV) return 0; + if (prune_ext_comp_using_neighbors >= 3) return 1; + int is_ref_match[2] = { 0 }; // 0 - match for forward refs // 1 - match for backward refs // Check if ref frames of this block matches with left neighbor. @@ -4472,7 +4440,7 @@ static INLINE int compound_skip_using_neighbor_refs( const int track_ref_match = is_ref_match[0] + is_ref_match[1]; // Pruning based on ref frame match with neighbors. 
- if (track_ref_match >= prune_compound_using_neighbors) return 0; + if (track_ref_match >= prune_ext_comp_using_neighbors) return 0; return 1; } @@ -4629,10 +4597,10 @@ static AOM_INLINE void evaluate_motion_mode_for_winner_candidates( if (!is_inter_singleref_mode(mbmi->mode)) continue; x->txfm_search_info.skip_txfm = 0; - struct macroblockd_plane *p = xd->plane; + struct macroblockd_plane *pd = xd->plane; const BUFFER_SET orig_dst = { - { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf }, - { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride }, + { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf }, + { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride }, }; set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); @@ -4681,8 +4649,7 @@ typedef struct { int skip_ref_frame_mask; int reach_first_comp_mode; int mode_thresh_mul_fact; - int intra_mode_idx_ls[INTRA_MODES]; - int intra_mode_num; + int *intra_mode_idx_ls; int num_single_modes_processed; int prune_cpd_using_sr_stats_ready; } InterModeSFArgs; @@ -4693,7 +4660,6 @@ static int skip_inter_mode(AV1_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize, InterModeSFArgs *args) { const SPEED_FEATURES *const sf = &cpi->sf; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = xd->mi[0]; // Get the actual prediction mode we are trying in this iteration const THR_MODES mode_enum = av1_default_mode_order[midx]; const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum]; @@ -4703,6 +4669,8 @@ static int skip_inter_mode(AV1_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize, const MV_REFERENCE_FRAME second_ref_frame = ref_frames[1]; const int comp_pred = second_ref_frame > INTRA_FRAME; + if (ref_frame == INTRA_FRAME) return 1; + // Check if this mode should be skipped because it is incompatible with the // current frame if (inter_mode_compatible_skip(cpi, x, bsize, this_mode, ref_frames)) @@ -4739,23 +4707,6 @@ static int skip_inter_mode(AV1_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize, return 1; } - // Speed 
features to prune out INTRA frames - if (ref_frame == INTRA_FRAME) { - if ((!cpi->oxcf.intra_mode_cfg.enable_smooth_intra || - sf->intra_sf.disable_smooth_intra) && - (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED || - mbmi->mode == SMOOTH_V_PRED)) - return 1; - if (!cpi->oxcf.intra_mode_cfg.enable_paeth_intra && - mbmi->mode == PAETH_PRED) - return 1; - - // Intra modes will be handled in another loop later. - assert(args->intra_mode_num < INTRA_MODES); - args->intra_mode_idx_ls[args->intra_mode_num++] = mode_enum; - return 1; - } - if (sf->inter_sf.prune_compound_using_single_ref && comp_pred) { // After we done with single reference modes, find the 2nd best RD // for a reference frame. Only search compound modes that have a reference @@ -4770,10 +4721,10 @@ static int skip_inter_mode(AV1_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize, return 1; } - if (sf->inter_sf.prune_compound_using_neighbors && comp_pred) { + if (sf->inter_sf.prune_ext_comp_using_neighbors && comp_pred) { if (compound_skip_using_neighbor_refs( xd, this_mode, ref_frames, - sf->inter_sf.prune_compound_using_neighbors)) + sf->inter_sf.prune_ext_comp_using_neighbors)) return 1; } @@ -4851,8 +4802,9 @@ static void tx_search_best_inter_candidates( : INT64_MAX; *yrd = INT64_MAX; int64_t best_rd_in_this_partition = INT64_MAX; + int num_inter_mode_cands = inter_modes_info->num; // Iterate over best inter mode candidates and perform tx search - for (int j = 0; j < inter_modes_info->num; ++j) { + for (int j = 0; j < num_inter_mode_cands; ++j) { const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx; *mbmi = inter_modes_info->mbmi_arr[data_idx]; int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx]; @@ -4930,6 +4882,27 @@ static void tx_search_best_inter_candidates( update_search_state(search_state, rd_cost, ctx, &rd_stats, &rd_stats_y, &rd_stats_uv, mode_enum, x, txfm_search_done); search_state->best_skip_rd[0] = skip_rd; + // Limit the total number of modes to be evaluated 
if the first is valid + // and transform skip or compound + if (cpi->sf.inter_sf.inter_mode_txfm_breakout) { + if (!j && (search_state->best_mbmode.skip_txfm || rd_stats.skip_txfm)) { + // Evaluate more candidates at high quantizers where occurrence of + // transform skip is high. + const int max_cands_cap[5] = { 2, 3, 5, 7, 9 }; + const int qindex_band = (5 * x->qindex) >> QINDEX_BITS; + num_inter_mode_cands = + AOMMIN(max_cands_cap[qindex_band], inter_modes_info->num); + } else if (!j && has_second_ref(&search_state->best_mbmode)) { + const int aggr = cpi->sf.inter_sf.inter_mode_txfm_breakout - 1; + // Evaluate more candidates at low quantizers where occurrence of + // single reference mode is high. + const int max_cands_cap_cmp[2][4] = { { 10, 7, 5, 4 }, + { 10, 7, 5, 3 } }; + const int qindex_band_cmp = (4 * x->qindex) >> QINDEX_BITS; + num_inter_mode_cands = AOMMIN( + max_cands_cap_cmp[aggr][qindex_band_cmp], inter_modes_info->num); + } + } } } } @@ -5050,13 +5023,41 @@ static AOM_INLINE void search_intra_modes_in_interframe( const int num_4x4 = bsize_to_num_blk(bsize); // Performs luma search - for (int j = 0; j < sf_args->intra_mode_num; ++j) { + int64_t best_model_rd = INT64_MAX; + int64_t top_intra_model_rd[TOP_INTRA_MODEL_COUNT]; + for (int i = 0; i < TOP_INTRA_MODEL_COUNT; i++) { + top_intra_model_rd[i] = INT64_MAX; + } + for (int mode_idx = INTRA_MODE_START; mode_idx < LUMA_MODE_COUNT; + ++mode_idx) { if (sf->intra_sf.skip_intra_in_interframe && search_state->intra_search_state.skip_intra_modes) break; - const THR_MODES mode_enum = sf_args->intra_mode_idx_ls[j]; - const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum]; - const PREDICTION_MODE this_mode = mode_def->mode; + set_y_mode_and_delta_angle(mode_idx, mbmi); + + // Use intra_y_mode_mask speed feature to skip intra mode evaluation. 
+ if (sf_args->mode_skip_mask->pred_modes[INTRA_FRAME] & (1 << mbmi->mode)) + continue; + + THR_MODES mode_enum = 0; + for (int i = 0; i < INTRA_MODE_END; ++i) { + if (mbmi->mode == av1_mode_defs[sf_args->intra_mode_idx_ls[i]].mode) { + mode_enum = sf_args->intra_mode_idx_ls[i]; + break; + } + } + if ((!cpi->oxcf.intra_mode_cfg.enable_smooth_intra || + cpi->sf.intra_sf.disable_smooth_intra) && + (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED || + mbmi->mode == SMOOTH_V_PRED)) + continue; + if (!cpi->oxcf.intra_mode_cfg.enable_paeth_intra && + mbmi->mode == PAETH_PRED) + continue; + if (av1_is_directional_mode(mbmi->mode) && + av1_use_angle_delta(bsize) == 0 && mbmi->angle_delta[PLANE_TYPE_Y] != 0) + continue; + const PREDICTION_MODE this_mode = mbmi->mode; assert(av1_mode_defs[mode_enum].ref_frame[0] == INTRA_FRAME); assert(av1_mode_defs[mode_enum].ref_frame[1] == NONE_FRAME); @@ -5084,7 +5085,8 @@ static AOM_INLINE void search_intra_modes_in_interframe( int64_t intra_rd_y = INT64_MAX; const int is_luma_result_valid = av1_handle_intra_y_mode( intra_search_state, cpi, x, bsize, intra_ref_frame_cost, ctx, - &intra_rd_stats_y, search_state->best_rd, &mode_cost_y, &intra_rd_y); + &intra_rd_stats_y, search_state->best_rd, &mode_cost_y, &intra_rd_y, + &best_model_rd, top_intra_model_rd); if (is_luma_result_valid && intra_rd_y < yrd_threshold) { is_best_y_mode_intra = 1; if (intra_rd_y < best_rd_y) { @@ -5147,12 +5149,6 @@ static AOM_INLINE void search_intra_modes_in_interframe( intra_rd_stats_uv.rate + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, uv_mode_cost); } - if (mode != DC_PRED && mode != PAETH_PRED) { - const int intra_cost_penalty = av1_get_intra_cost_penalty( - cm->quant_params.base_qindex, cm->quant_params.y_dc_delta_q, - cm->seq_params.bit_depth); - intra_rd_stats.rate += intra_cost_penalty; - } // Intra block is always coded as non-skip intra_rd_stats.skip_txfm = 0; @@ -5189,6 +5185,84 @@ static AOM_INLINE void search_intra_modes_in_interframe( } 
} +#if !CONFIG_REALTIME_ONLY +// Prepare inter_cost and intra_cost from TPL stats, which are used as ML +// features in intra mode pruning. +static AOM_INLINE void calculate_cost_from_tpl_data( + const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, + int mi_col, int64_t *inter_cost, int64_t *intra_cost) { + const AV1_COMMON *const cm = &cpi->common; + // Only consider full SB. + const BLOCK_SIZE sb_size = cm->seq_params->sb_size; + const int tpl_bsize_1d = cpi->ppi->tpl_data.tpl_bsize_1d; + const int len = (block_size_wide[sb_size] / tpl_bsize_1d) * + (block_size_high[sb_size] / tpl_bsize_1d); + SuperBlockEnc *sb_enc = &x->sb_enc; + if (sb_enc->tpl_data_count == len) { + const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(tpl_bsize_1d); + const int tpl_stride = sb_enc->tpl_stride; + const int tplw = mi_size_wide[tpl_bsize]; + const int tplh = mi_size_high[tpl_bsize]; + const int nw = mi_size_wide[bsize] / tplw; + const int nh = mi_size_high[bsize] / tplh; + if (nw >= 1 && nh >= 1) { + const int of_h = mi_row % mi_size_high[sb_size]; + const int of_w = mi_col % mi_size_wide[sb_size]; + const int start = of_h / tplh * tpl_stride + of_w / tplw; + + for (int k = 0; k < nh; k++) { + for (int l = 0; l < nw; l++) { + *inter_cost += sb_enc->tpl_inter_cost[start + k * tpl_stride + l]; + *intra_cost += sb_enc->tpl_intra_cost[start + k * tpl_stride + l]; + } + } + *inter_cost /= nw * nh; + *intra_cost /= nw * nh; + } + } +} +#endif // !CONFIG_REALTIME_ONLY + +// When the speed feature skip_intra_in_interframe > 0, enable ML model to prune +// intra mode search. 
+static AOM_INLINE void skip_intra_modes_in_interframe( + AV1_COMMON *const cm, struct macroblock *x, BLOCK_SIZE bsize, + InterModeSearchState *search_state, int64_t inter_cost, int64_t intra_cost, + int skip_intra_in_interframe) { + MACROBLOCKD *const xd = &x->e_mbd; + if (inter_cost >= 0 && intra_cost >= 0) { + aom_clear_system_state(); + const NN_CONFIG *nn_config = (AOMMIN(cm->width, cm->height) <= 480) + ? &av1_intrap_nn_config + : &av1_intrap_hd_nn_config; + float nn_features[6]; + float scores[2] = { 0.0f }; + + nn_features[0] = (float)search_state->best_mbmode.skip_txfm; + nn_features[1] = (float)mi_size_wide_log2[bsize]; + nn_features[2] = (float)mi_size_high_log2[bsize]; + nn_features[3] = (float)intra_cost; + nn_features[4] = (float)inter_cost; + const int ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd); + const int ac_q_max = av1_ac_quant_QTX(255, 0, xd->bd); + nn_features[5] = (float)(ac_q_max / ac_q); + + av1_nn_predict(nn_features, nn_config, 1, scores); + aom_clear_system_state(); + + // For two parameters, the max prob returned from av1_nn_softmax equals + // 1.0 / (1.0 + e^(-|diff_score|)). Here use scores directly to avoid the + // calling of av1_nn_softmax. + const float thresh[2] = { 1.4f, 1.4f }; + if (scores[1] > scores[0] + thresh[skip_intra_in_interframe - 1]) { + search_state->intra_search_state.skip_intra_modes = 1; + } + } else if ((search_state->best_mbmode.skip_txfm) && + (skip_intra_in_interframe >= 2)) { + search_state->intra_search_state.skip_intra_modes = 1; + } +} + // TODO(chiyotsai@google.com): See the todo for av1_rd_pick_intra_mode_sb. 
void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x, struct RD_STATS *rd_cost, @@ -5231,6 +5305,7 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, -1, -1, -1, + { 0 }, { 0 } }; for (i = 0; i < MODE_CTX_REF_FRAMES; ++i) args.cmp_mode[i] = -1; // Indicates the appropriate number of simple translation winner modes for @@ -5265,10 +5340,13 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, mbmi->partition != PARTITION_HORZ) || cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions >= 2) { picked_ref_frames_mask = - fetch_picked_ref_frames_mask(x, bsize, cm->seq_params.mib_size); + fetch_picked_ref_frames_mask(x, bsize, cm->seq_params->mib_size); } } +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, set_params_rd_pick_inter_mode_time); +#endif // Skip ref frames that never selected by square blocks. const int skip_ref_frame_mask = picked_ref_frames_mask ? ~picked_ref_frames_mask : 0; @@ -5280,6 +5358,9 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, set_params_rd_pick_inter_mode(cpi, x, &args, bsize, &mode_skip_mask, skip_ref_frame_mask, ref_costs_single, ref_costs_comp, yv12_mb); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, set_params_rd_pick_inter_mode_time); +#endif int64_t best_est_rd = INT64_MAX; const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; @@ -5292,6 +5373,10 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, cpi->sf.rt_sf.force_tx_search_off); InterModesInfo *inter_modes_info = x->inter_modes_info; inter_modes_info->num = 0; + int intra_mode_idx_ls[INTRA_MODES]; + for (i = 0; i < INTRA_MODES; ++i) { + intra_mode_idx_ls[i] = i + THR_DC; + } // Temporary buffers used by handle_inter_mode(). 
uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_pred_bufs[0]); @@ -5337,40 +5422,13 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, const int do_pruning = (AOMMIN(cm->width, cm->height) > 480 && cpi->speed <= 1) ? 0 : 1; if (do_pruning && sf->intra_sf.skip_intra_in_interframe && - cpi->oxcf.algo_cfg.enable_tpl_model) { - // Only consider full SB. - const BLOCK_SIZE sb_size = cm->seq_params.sb_size; - const int tpl_bsize_1d = cpi->tpl_data.tpl_bsize_1d; - const int len = (block_size_wide[sb_size] / tpl_bsize_1d) * - (block_size_high[sb_size] / tpl_bsize_1d); - SuperBlockEnc *sb_enc = &x->sb_enc; - if (sb_enc->tpl_data_count == len) { - const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(tpl_bsize_1d); - const int tpl_stride = sb_enc->tpl_stride; - const int tplw = mi_size_wide[tpl_bsize]; - const int tplh = mi_size_high[tpl_bsize]; - const int nw = mi_size_wide[bsize] / tplw; - const int nh = mi_size_high[bsize] / tplh; - if (nw >= 1 && nh >= 1) { - const int of_h = mi_row % mi_size_high[sb_size]; - const int of_w = mi_col % mi_size_wide[sb_size]; - const int start = of_h / tplh * tpl_stride + of_w / tplw; - - for (int k = 0; k < nh; k++) { - for (int l = 0; l < nw; l++) { - inter_cost += sb_enc->tpl_inter_cost[start + k * tpl_stride + l]; - intra_cost += sb_enc->tpl_intra_cost[start + k * tpl_stride + l]; - } - } - inter_cost /= nw * nh; - intra_cost /= nw * nh; - } - } - } + cpi->oxcf.algo_cfg.enable_tpl_model) + calculate_cost_from_tpl_data(cpi, x, bsize, mi_row, mi_col, &inter_cost, + &intra_cost); #endif // !CONFIG_REALTIME_ONLY // Initialize best mode stats for winner mode processing - av1_zero(x->winner_mode_stats); + av1_zero_array(x->winner_mode_stats, MAX_WINNER_MODE_COUNT_INTER); x->winner_mode_count = 0; store_winner_mode_stats(&cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID, NULL, bsize, best_rd_so_far, @@ -5389,20 +5447,20 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, 
skip_ref_frame_mask, 0, mode_thresh_mul_fact, - { 0 }, - 0, + intra_mode_idx_ls, 0, 0 }; int64_t best_inter_yrd = INT64_MAX; - // This is the main loop of this function. It loops over all possible modes - // and calls handle_inter_mode() to compute the RD for each. + // This is the main loop of this function. It loops over all possible inter + // modes and calls handle_inter_mode() to compute the RD for each. // Here midx is just an iterator index that should not be used by itself // except to keep track of the number of modes searched. It should be used // with av1_default_mode_order to get the enum that defines the mode, which // can be used with av1_mode_defs to get the prediction mode and the ref // frames. - for (THR_MODES midx = THR_MODE_START; midx < THR_MODE_END; ++midx) { + for (THR_MODES midx = THR_INTER_MODE_START; midx < THR_INTER_MODE_END; + ++midx) { // Get the actual prediction mode we are trying in this iteration const THR_MODES mode_enum = av1_default_mode_order[midx]; const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum]; @@ -5420,9 +5478,16 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, txfm_info->skip_txfm = 0; sf_args.num_single_modes_processed += is_single_pred; set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); - +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, skip_inter_mode_time); +#endif // Apply speed features to decide if this inter mode can be skipped - if (skip_inter_mode(cpi, x, bsize, ref_frame_rd, midx, &sf_args)) continue; + const int is_skip_inter_mode = + skip_inter_mode(cpi, x, bsize, ref_frame_rd, midx, &sf_args); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, skip_inter_mode_time); +#endif + if (is_skip_inter_mode) continue; // Select prediction reference frames. 
for (i = 0; i < num_planes; i++) { @@ -5549,36 +5614,11 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, // Gate intra mode evaluation if best of inter is skip except when source // variance is extremely low const unsigned int src_var_thresh_intra_skip = 1; - if (sf->intra_sf.skip_intra_in_interframe && - (x->source_variance > src_var_thresh_intra_skip)) { - if (inter_cost >= 0 && intra_cost >= 0) { - aom_clear_system_state(); - const NN_CONFIG *nn_config = (AOMMIN(cm->width, cm->height) <= 480) - ? &av1_intrap_nn_config - : &av1_intrap_hd_nn_config; - float nn_features[6]; - float scores[2] = { 0.0f }; - float probs[2] = { 0.0f }; - - nn_features[0] = (float)search_state.best_mbmode.skip_txfm; - nn_features[1] = (float)mi_size_wide_log2[bsize]; - nn_features[2] = (float)mi_size_high_log2[bsize]; - nn_features[3] = (float)intra_cost; - nn_features[4] = (float)inter_cost; - const int ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd); - const int ac_q_max = av1_ac_quant_QTX(255, 0, xd->bd); - nn_features[5] = (float)(ac_q_max / ac_q); - - av1_nn_predict(nn_features, nn_config, 1, scores); - aom_clear_system_state(); - av1_nn_softmax(scores, probs, 2); - - if (probs[1] > 0.8) search_state.intra_search_state.skip_intra_modes = 1; - } else if ((search_state.best_mbmode.skip_txfm) && - (sf->intra_sf.skip_intra_in_interframe >= 2)) { - search_state.intra_search_state.skip_intra_modes = 1; - } - } + const int skip_intra_in_interframe = sf->intra_sf.skip_intra_in_interframe; + if (skip_intra_in_interframe && + (x->source_variance > src_var_thresh_intra_skip)) + skip_intra_modes_in_interframe(cm, x, bsize, &search_state, inter_cost, + intra_cost, skip_intra_in_interframe); const unsigned int intra_ref_frame_cost = ref_costs_single[INTRA_FRAME]; search_intra_modes_in_interframe(&search_state, cpi, x, rd_cost, bsize, ctx, @@ -5588,6 +5628,9 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, end_timing(cpi, 
handle_intra_mode_time); #endif +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, refine_winner_mode_tx_time); +#endif int winner_mode_count = cpi->sf.winner_mode_sf.multi_winner_mode_type ? x->winner_mode_count : 1; // In effect only when fast tx search speed features are enabled. @@ -5595,6 +5638,9 @@ void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data, cpi, x, rd_cost, bsize, ctx, &search_state.best_mode_index, &search_state.best_mbmode, yv12_mb, search_state.best_rate_y, search_state.best_rate_uv, &search_state.best_skip2, winner_mode_count); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, refine_winner_mode_tx_time); +#endif // Initialize default mode evaluation params set_mode_eval_params(cpi, x, DEFAULT_EVAL); @@ -5803,7 +5849,7 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, for (i = 0; i < SWITCHABLE_FILTERS; ++i) { mbmi->interp_filters = av1_broadcast_interp_filter(i); rs = av1_get_switchable_rate(x, xd, interp_filter, - cm->seq_params.enable_dual_filter); + cm->seq_params->enable_dual_filter); if (rs < best_rs) { best_rs = rs; best_filter = mbmi->interp_filters.as_filters.y_filter; @@ -5814,7 +5860,7 @@ void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, // Set the appropriate filter mbmi->interp_filters = av1_broadcast_interp_filter(best_filter); rate2 += av1_get_switchable_rate(x, xd, interp_filter, - cm->seq_params.enable_dual_filter); + cm->seq_params->enable_dual_filter); if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) rate2 += comp_inter_cost[comp_pred]; diff --git a/third_party/libaom/source/libaom/av1/encoder/rdopt.h b/third_party/libaom/source/libaom/av1/encoder/rdopt.h index 362da7b798..055a49e9f1 100644 --- a/third_party/libaom/source/libaom/av1/encoder/rdopt.h +++ b/third_party/libaom/source/libaom/av1/encoder/rdopt.h @@ -217,10 +217,10 @@ static INLINE int av1_encoder_get_relative_dist(int a, int b) { static INLINE int av1_get_sb_mi_size(const AV1_COMMON *const 
cm) { const int mi_alloc_size_1d = mi_size_wide[cm->mi_params.mi_alloc_bsize]; int sb_mi_rows = - (mi_size_wide[cm->seq_params.sb_size] + mi_alloc_size_1d - 1) / + (mi_size_wide[cm->seq_params->sb_size] + mi_alloc_size_1d - 1) / mi_alloc_size_1d; - assert(mi_size_wide[cm->seq_params.sb_size] == - mi_size_high[cm->seq_params.sb_size]); + assert(mi_size_wide[cm->seq_params->sb_size] == + mi_size_high[cm->seq_params->sb_size]); int sb_mi_size = sb_mi_rows * sb_mi_rows; return sb_mi_size; diff --git a/third_party/libaom/source/libaom/av1/encoder/rdopt_utils.h b/third_party/libaom/source/libaom/av1/encoder/rdopt_utils.h index ddd180f7ed..f00037992e 100644 --- a/third_party/libaom/source/libaom/av1/encoder/rdopt_utils.h +++ b/third_party/libaom/source/libaom/av1/encoder/rdopt_utils.h @@ -433,8 +433,10 @@ static INLINE void set_tx_type_prune(const SPEED_FEATURES *sf, txfm_params->prune_2d_txfm_mode = sf->tx_sf.tx_type_search.prune_2d_txfm_mode; if (!winner_mode_tx_type_pruning) return; - const int prune_mode[2][2] = { { TX_TYPE_PRUNE_4, TX_TYPE_PRUNE_0 }, - { TX_TYPE_PRUNE_5, TX_TYPE_PRUNE_2 } }; + const int prune_mode[4][2] = { { TX_TYPE_PRUNE_3, TX_TYPE_PRUNE_0 }, + { TX_TYPE_PRUNE_4, TX_TYPE_PRUNE_0 }, + { TX_TYPE_PRUNE_5, TX_TYPE_PRUNE_2 }, + { TX_TYPE_PRUNE_5, TX_TYPE_PRUNE_3 } }; txfm_params->prune_2d_txfm_mode = prune_mode[winner_mode_tx_type_pruning - 1][is_winner_mode]; } @@ -569,7 +571,7 @@ static INLINE CFL_ALLOWED_TYPE store_cfl_required_rdo(const AV1_COMMON *cm, const MACROBLOCK *x) { const MACROBLOCKD *xd = &x->e_mbd; - if (cm->seq_params.monochrome || !xd->is_chroma_ref) return CFL_DISALLOWED; + if (cm->seq_params->monochrome || !xd->is_chroma_ref) return CFL_DISALLOWED; if (!xd->is_chroma_ref) { // For non-chroma-reference blocks, we should always store the luma pixels, diff --git a/third_party/libaom/source/libaom/av1/encoder/segmentation.c b/third_party/libaom/source/libaom/av1/encoder/segmentation.c index de17d571ff..edb6ef67fa 100644 --- 
a/third_party/libaom/source/libaom/av1/encoder/segmentation.c +++ b/third_party/libaom/source/libaom/av1/encoder/segmentation.c @@ -175,6 +175,14 @@ void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd) { int no_pred_cost; int t_pred_cost = INT_MAX; int tile_col, tile_row, mi_row, mi_col; + + if (!seg->update_map) return; + if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) { + seg->temporal_update = 0; + assert(seg->update_data == 1); + return; + } + unsigned temporal_predictor_count[SEG_TEMPORAL_PRED_CTXS][2] = { { 0 } }; unsigned no_pred_segcounts[MAX_SEGMENTS] = { 0 }; unsigned t_unpred_seg_counts[MAX_SEGMENTS] = { 0 }; @@ -194,15 +202,15 @@ void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd) { tile_info.mi_row_start * cm->mi_params.mi_stride + tile_info.mi_col_start; for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end; - mi_row += cm->seq_params.mib_size, - mi_ptr += cm->seq_params.mib_size * cm->mi_params.mi_stride) { + mi_row += cm->seq_params->mib_size, + mi_ptr += cm->seq_params->mib_size * cm->mi_params.mi_stride) { MB_MODE_INFO **mi = mi_ptr; for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; - mi_col += cm->seq_params.mib_size, - mi += cm->seq_params.mib_size) { + mi_col += cm->seq_params->mib_size, + mi += cm->seq_params->mib_size) { count_segs_sb(cm, xd, &tile_info, mi, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, mi_row, - mi_col, cm->seq_params.sb_size); + mi_col, cm->seq_params->sb_size); } } } diff --git a/third_party/libaom/source/libaom/av1/encoder/sparse_linear_solver.c b/third_party/libaom/source/libaom/av1/encoder/sparse_linear_solver.c index 1c556c2a09..dbfcaabbd6 100644 --- a/third_party/libaom/source/libaom/av1/encoder/sparse_linear_solver.c +++ b/third_party/libaom/source/libaom/av1/encoder/sparse_linear_solver.c @@ -8,7 +8,6 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at 
www.aomedia.org/license/patent. */ -#include <float.h> #include "av1/common/av1_common_int.h" #include "av1/encoder/sparse_linear_solver.h" #include "config/aom_config.h" @@ -408,4 +407,4 @@ void av1_steepest_descent_sparse(const SPARSE_MTX *A, const double *b, int bl, aom_free(Ad); } -#endif // CONFIG_OPFL +#endif // CONFIG_OPTICAL_FLOW_API diff --git a/third_party/libaom/source/libaom/av1/encoder/sparse_linear_solver.h b/third_party/libaom/source/libaom/av1/encoder/sparse_linear_solver.h index 3cacb51b93..a3f2f7b964 100644 --- a/third_party/libaom/source/libaom/av1/encoder/sparse_linear_solver.h +++ b/third_party/libaom/source/libaom/av1/encoder/sparse_linear_solver.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AV1_COMMON_SPARSE_LINEAR_SOLVER_H_ -#define AV1_COMMON_SPARSE_LINEAR_SOLVER_H_ +#ifndef AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_ +#define AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_ #ifdef __cplusplus extern "C" { @@ -64,4 +64,4 @@ void av1_steepest_descent_sparse(const SPARSE_MTX *A, const double *b, int bl, } // extern "C" #endif -#endif /* AV1_COMMON_SPARSE_LINEAR_SOLVER_H_ */ +#endif /* AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_ */ diff --git a/third_party/libaom/source/libaom/av1/encoder/speed_features.c b/third_party/libaom/source/libaom/av1/encoder/speed_features.c index 2244aaae91..916a818513 100644 --- a/third_party/libaom/source/libaom/av1/encoder/speed_features.c +++ b/third_party/libaom/source/libaom/av1/encoder/speed_features.c @@ -274,6 +274,20 @@ static void set_allintra_speed_feature_framesize_dependent( sf->part_sf.use_square_partition_only_threshold = BLOCK_16X16; } + + if (speed >= 7) { + if (!is_480p_or_larger) { + sf->rt_sf.nonrd_check_partition_merge_mode = 2; + } + } + + if (speed >= 8) { + // TODO(kyslov): add more speed features to control speed/quality + } + + if (speed >= 9) { + // TODO(kyslov): add more speed features to control speed/quality + } } static void 
set_allintra_speed_features_framesize_independent( @@ -289,8 +303,11 @@ static void set_allintra_speed_features_framesize_independent( sf->part_sf.prune_part4_search = 2; sf->part_sf.simple_motion_search_prune_rect = 1; sf->part_sf.ml_predict_breakout_level = use_hbd ? 1 : 3; + sf->part_sf.reuse_prev_rd_results_for_part_ab = 1; + sf->part_sf.use_best_rd_for_pruning = 1; sf->intra_sf.intra_pruning_with_hog = 1; + sf->intra_sf.dv_cost_upd_level = INTERNAL_COST_UPD_OFF; sf->tx_sf.adaptive_txb_search_level = 1; sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1; @@ -300,7 +317,7 @@ static void set_allintra_speed_features_framesize_independent( sf->rt_sf.use_nonrd_pick_mode = 0; sf->rt_sf.use_real_time_ref_set = 0; - if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION || + if (cpi->ppi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION || cpi->use_screen_content_tools) { sf->mv_sf.exhaustive_searches_thresh = (1 << 20); } else { @@ -318,10 +335,12 @@ static void set_allintra_speed_features_framesize_independent( // speed feature accordingly sf->part_sf.simple_motion_search_split = allow_screen_content_tools ? 1 : 2; sf->part_sf.ml_predict_breakout_level = use_hbd ? 
2 : 3; + sf->part_sf.reuse_best_prediction_for_part_ab = 1; sf->mv_sf.exhaustive_searches_thresh <<= 1; sf->intra_sf.prune_palette_search_level = 1; + sf->intra_sf.top_intra_model_count_allowed = 3; sf->tx_sf.adaptive_txb_search_level = 2; sf->tx_sf.inter_tx_size_search_init_depth_rect = 1; @@ -348,6 +367,7 @@ static void set_allintra_speed_features_framesize_independent( sf->intra_sf.disable_smooth_intra = 1; sf->intra_sf.intra_pruning_with_hog = 2; + sf->intra_sf.prune_filter_intra_level = 1; sf->rd_sf.perform_coeff_opt = 3; @@ -397,9 +417,6 @@ static void set_allintra_speed_features_framesize_independent( sf->intra_sf.intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL; sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL; sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL; - sf->intra_sf.intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; - sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; - sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V; sf->intra_sf.prune_chroma_modes_using_luma_winner = 1; sf->mv_sf.simple_motion_subpel_force_stop = HALF_PEL; @@ -408,7 +425,7 @@ static void set_allintra_speed_features_framesize_independent( sf->tpl_sf.subpel_force_stop = HALF_PEL; sf->tpl_sf.search_method = FAST_BIGDIA; - sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 1; + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2; sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1; sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3; sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 1; @@ -443,9 +460,10 @@ static void set_allintra_speed_features_framesize_independent( } if (speed >= 6) { - sf->intra_sf.disable_filter_intra = 1; + sf->intra_sf.prune_filter_intra_level = 2; sf->intra_sf.chroma_intra_pruning_with_hog = 4; sf->intra_sf.intra_pruning_with_hog = 4; + sf->intra_sf.cfl_search_range = 1; sf->part_sf.prune_rectangular_split_based_on_qidx = allow_screen_content_tools ? 
0 : 1; @@ -458,7 +476,7 @@ static void set_allintra_speed_features_framesize_independent( sf->mv_sf.use_bsize_dependent_search_method = 1; - sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2; + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 3; sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 0; // Use largest txfm block size for square coding blocks. sf->tx_sf.intra_tx_size_search_init_depth_sqr = 2; @@ -466,10 +484,39 @@ static void set_allintra_speed_features_framesize_independent( sf->rd_sf.perform_coeff_opt = 6; sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4; + sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q; sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_OFF; } + if (speed >= 7) { + sf->part_sf.default_min_partition_size = BLOCK_8X8; + sf->part_sf.partition_search_type = VAR_BASED_PARTITION; + + sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q; + + sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH; + sf->rt_sf.use_nonrd_pick_mode = 1; + sf->rt_sf.nonrd_check_partition_merge_mode = 1; + sf->rt_sf.nonrd_check_partition_split = 0; + sf->rt_sf.skip_intra_pred_if_tx_skip = 1; + // Set mask for intra modes. + for (int i = 0; i < BLOCK_SIZES; ++i) + if (i >= BLOCK_32X32) + sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC; + else + // Use DC, H, V intra mode for block sizes < 32X32. + sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V; + } + + if (speed >= 8) { + // TODO(kyslov): add more speed features to control speed/quality + } + + if (speed >= 9) { + // TODO(kyslov): add more speed features to control speed/quality + } + // Intra txb hash is currently not compatible with multi-winner mode as the // hashes got reset during multi-winner mode processing. 
assert(IMPLIES( @@ -480,6 +527,7 @@ static void set_allintra_speed_features_framesize_independent( static void set_good_speed_feature_framesize_dependent( const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) { const AV1_COMMON *const cm = &cpi->common; + const int is_480p_or_lesser = AOMMIN(cm->width, cm->height) <= 480; const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080; @@ -518,7 +566,16 @@ static void set_good_speed_feature_framesize_dependent( sf->mv_sf.use_downsampled_sad = 1; } + if (!is_720p_or_larger) { + const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg; + const int rate_tolerance = + AOMMIN(rc_cfg->under_shoot_pct, rc_cfg->over_shoot_pct); + sf->hl_sf.recode_tolerance = 25 + (rate_tolerance >> 2); + } + if (speed >= 1) { + if (is_480p_or_lesser) sf->inter_sf.skip_newmv_in_drl = 1; + if (is_720p_or_larger) { sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128; } else if (is_480p_or_larger) { @@ -561,6 +618,12 @@ static void set_good_speed_feature_framesize_dependent( } if (is_480p_or_larger) { + sf->inter_sf.disable_interintra_wedge_var_thresh = 100; + } else { + sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX; + } + + if (is_480p_or_larger) { sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 1; if (use_hbd) sf->tx_sf.prune_tx_size_level = 2; } else { @@ -573,6 +636,8 @@ static void set_good_speed_feature_framesize_dependent( } if (speed >= 3) { + sf->inter_sf.skip_newmv_in_drl = 2; + sf->part_sf.ml_early_term_after_part_split_level = 0; if (is_720p_or_larger) { @@ -584,6 +649,10 @@ static void set_good_speed_feature_framesize_dependent( sf->part_sf.partition_search_breakout_rate_thr = 120; } if (use_hbd) sf->tx_sf.prune_tx_size_level = 3; + + if (is_480p_or_larger) sf->intra_sf.top_intra_model_count_allowed = 2; + + 
sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX; } if (speed >= 4) { @@ -598,11 +667,14 @@ static void set_good_speed_feature_framesize_dependent( } sf->inter_sf.prune_obmc_prob_thresh = INT_MAX; + if (is_480p_or_lesser) sf->inter_sf.skip_newmv_in_drl = 3; if (is_720p_or_larger) sf->hl_sf.recode_tolerance = 32; else sf->hl_sf.recode_tolerance = 55; + + sf->intra_sf.top_intra_model_count_allowed = 2; } if (speed >= 5) { @@ -612,6 +684,8 @@ static void set_good_speed_feature_framesize_dependent( sf->inter_sf.prune_warped_prob_thresh = 8; } if (is_720p_or_larger) sf->hl_sf.recode_tolerance = 40; + + sf->inter_sf.skip_newmv_in_drl = 4; } if (speed >= 6) { @@ -630,7 +704,9 @@ static void set_good_speed_feature_framesize_dependent( } if (!is_720p_or_larger) { - sf->inter_sf.mv_cost_upd_level = 2; + sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW_SET; + sf->inter_sf.coeff_cost_upd_level = INTERNAL_COST_UPD_SBROW; + sf->inter_sf.mode_cost_upd_level = INTERNAL_COST_UPD_SBROW; } if (is_720p_or_larger) { @@ -650,10 +726,10 @@ static void set_good_speed_feature_framesize_dependent( static void set_good_speed_features_framesize_independent( const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) { const AV1_COMMON *const cm = &cpi->common; - const GF_GROUP *const gf_group = &cpi->gf_group; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const int boosted = frame_is_boosted(cpi); const int is_boosted_arf2_bwd_type = - boosted || gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE; + boosted || gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE; const int allow_screen_content_tools = cm->features.allow_screen_content_tools; const int use_hbd = cpi->oxcf.use_highbitdepth; @@ -670,6 +746,8 @@ static void set_good_speed_features_framesize_independent( sf->part_sf.prune_part4_search = 2; sf->part_sf.simple_motion_search_prune_rect = 1; sf->part_sf.ml_predict_breakout_level = use_hbd ? 
1 : 3; + sf->part_sf.reuse_prev_rd_results_for_part_ab = 1; + sf->part_sf.use_best_rd_for_pruning = 1; // TODO(debargha): Test, tweak and turn on either 1 or 2 sf->inter_sf.inter_mode_rd_model_estimation = 1; @@ -698,7 +776,7 @@ static void set_good_speed_features_framesize_independent( sf->rt_sf.use_nonrd_pick_mode = 0; sf->rt_sf.use_real_time_ref_set = 0; - if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION || + if (cpi->ppi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION || cpi->use_screen_content_tools) { sf->mv_sf.exhaustive_searches_thresh = (1 << 20); } else { @@ -725,7 +803,6 @@ static void set_good_speed_features_framesize_independent( sf->mv_sf.use_accurate_subpel_search = USE_4_TAPS; sf->mv_sf.disable_extensive_joint_motion_search = 1; - sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1; sf->inter_sf.prune_comp_search_by_single_result = boosted ? 2 : 1; sf->inter_sf.prune_comp_type_by_comp_avg = 1; sf->inter_sf.prune_comp_type_by_model_rd = boosted ? 0 : 1; @@ -736,7 +813,6 @@ static void set_good_speed_features_framesize_independent( sf->inter_sf.reduce_inter_modes = boosted ? 1 : 3; sf->inter_sf.reuse_inter_intra_mode = 1; sf->inter_sf.selective_ref_frame = 2; - sf->inter_sf.skip_repeated_newmv = 1; sf->interp_sf.use_interp_filter = 1; @@ -766,7 +842,11 @@ static void set_good_speed_features_framesize_independent( if (speed >= 2) { sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF; + sf->fp_sf.skip_motion_search_threshold = 25; + sf->part_sf.allow_partition_search_skip = 1; + sf->part_sf.reuse_best_prediction_for_part_ab = + !frame_is_intra_only(&cpi->common); sf->mv_sf.auto_mv_step_size = 1; sf->mv_sf.simple_motion_subpel_force_stop = QUARTER_PEL; @@ -778,20 +858,21 @@ static void set_good_speed_features_framesize_independent( // bit more closely to figure out why. 
sf->inter_sf.adaptive_rd_thresh = 1; sf->inter_sf.comp_inter_joint_search_thresh = BLOCK_SIZES_ALL; - sf->inter_sf.disable_interintra_wedge_var_thresh = 100; sf->inter_sf.disable_interinter_wedge_var_thresh = 100; sf->inter_sf.fast_interintra_wedge_search = 1; sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 1; - sf->inter_sf.prune_compound_using_neighbors = 1; + sf->inter_sf.prune_ext_comp_using_neighbors = 1; sf->inter_sf.prune_comp_using_best_single_mode_ref = 2; sf->inter_sf.prune_comp_type_by_comp_avg = 2; - sf->inter_sf.reuse_best_prediction_for_part_ab = 1; sf->inter_sf.selective_ref_frame = 3; sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED; // Enable fast search only for COMPOUND_DIFFWTD type. sf->inter_sf.enable_fast_compound_mode_search = 1; sf->inter_sf.reuse_mask_search_results = 1; sf->inter_sf.txfm_rd_gate_level = boosted ? 0 : 1; + sf->inter_sf.disable_interinter_wedge_newmv_search = + is_boosted_arf2_bwd_type ? 0 : 1; + sf->inter_sf.inter_mode_txfm_breakout = boosted ? 0 : 1; // TODO(Sachin): Enable/Enhance this speed feature for speed 2 & 3 sf->interp_sf.adaptive_interp_filter_search = 1; @@ -831,7 +912,8 @@ static void set_good_speed_features_framesize_independent( sf->mv_sf.search_method = DIAMOND; sf->mv_sf.disable_second_mv = 2; - sf->inter_sf.mv_cost_upd_level = 1; + sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1; + sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW; sf->inter_sf.disable_onesided_comp = 1; // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine // it with cpi->sf.disable_wedge_search_var_thresh. @@ -843,10 +925,11 @@ static void set_good_speed_features_framesize_independent( sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 2; sf->inter_sf.selective_ref_frame = 5; sf->inter_sf.skip_repeated_ref_mv = 1; - sf->inter_sf.skip_repeated_full_newmv = 1; sf->inter_sf.reuse_compound_type_decision = 1; sf->inter_sf.txfm_rd_gate_level = boosted ? 
0 : (is_boosted_arf2_bwd_type ? 1 : 2); + sf->inter_sf.enable_fast_wedge_mask_search = 1; + sf->inter_sf.inter_mode_txfm_breakout = boosted ? 0 : 2; sf->interp_sf.adaptive_interp_filter_search = 2; @@ -865,6 +948,8 @@ static void set_good_speed_features_framesize_independent( sf->tx_sf.adaptive_txb_search_level = boosted ? 2 : 3; sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2; sf->tx_sf.use_intra_txb_hash = 1; + sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3; + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 1; // TODO(any): Refactor the code related to following winner mode speed // features @@ -874,10 +959,10 @@ static void set_good_speed_features_framesize_independent( frame_is_intra_only(&cpi->common) ? 0 : 1; sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist = 1; sf->winner_mode_sf.motion_mode_for_winner_cand = - boosted - ? 0 - : gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE ? 1 - : 2; + boosted ? 0 + : gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE + ? 1 + : 2; // TODO(any): evaluate if these lpf features can be moved to speed 2. // For screen content, "prune_sgr_based_on_wiener = 2" cause large quality @@ -889,6 +974,8 @@ static void set_good_speed_features_framesize_independent( } if (speed >= 4) { + sf->gm_sf.prune_zero_mv_with_sse = 1; + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; sf->part_sf.simple_motion_search_prune_agg = 2; @@ -901,7 +988,7 @@ static void set_good_speed_features_framesize_independent( sf->inter_sf.txfm_rd_gate_level = boosted ? 0 : 3; sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 
0 : 2; - sf->inter_sf.prune_compound_using_neighbors = 2; + sf->inter_sf.prune_ext_comp_using_neighbors = 2; sf->inter_sf.prune_obmc_prob_thresh = INT_MAX; sf->interp_sf.cb_pred_filter_search = 1; @@ -911,9 +998,10 @@ static void set_good_speed_features_framesize_independent( sf->intra_sf.intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL; sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL; sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL; - sf->intra_sf.intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; - sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; - sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V; + // TODO(any): "intra_y_mode_mask" doesn't help much at speed 4. + // sf->intra_sf.intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; + // sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; + // sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V; // TODO(any): Experiment with this speed feature set to 2 for higher quality // presets as well sf->intra_sf.skip_intra_in_interframe = 2; @@ -923,10 +1011,10 @@ static void set_good_speed_features_framesize_independent( sf->tpl_sf.prune_starting_mv = 2; sf->tpl_sf.subpel_force_stop = HALF_PEL; sf->tpl_sf.search_method = FAST_BIGDIA; + sf->tpl_sf.gop_length_decision_method = 1; - sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 1; + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2; sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1; - sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3; sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 1; // TODO(any): Experiment with enabling of this speed feature as hash state // is reset during winner mode processing @@ -948,9 +1036,14 @@ static void set_good_speed_features_framesize_independent( } if (speed >= 5) { + sf->fp_sf.reduce_mv_step_param = 4; + sf->part_sf.simple_motion_search_prune_agg = 3; sf->part_sf.ext_partition_eval_thresh = allow_screen_content_tools ? 
BLOCK_8X8 : BLOCK_16X16; + sf->part_sf.prune_sub_8x8_partition_level = + (allow_screen_content_tools || frame_is_intra_only(&cpi->common)) ? 0 + : 2; sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX; sf->inter_sf.prune_inter_modes_if_skippable = 1; @@ -974,8 +1067,11 @@ static void set_good_speed_features_framesize_independent( sf->tpl_sf.prune_starting_mv = 3; sf->tpl_sf.use_y_only_rate_distortion = 1; sf->tpl_sf.subpel_force_stop = FULL_PEL; + sf->tpl_sf.gop_length_decision_method = 2; sf->winner_mode_sf.dc_blk_pred_level = 1; + + sf->fp_sf.disable_recon = 1; } if (speed >= 6) { @@ -986,9 +1082,14 @@ static void set_good_speed_features_framesize_independent( sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 3; sf->inter_sf.prune_nearmv_using_neighbors = 1; sf->inter_sf.selective_ref_frame = 6; + sf->inter_sf.prune_ext_comp_using_neighbors = 3; sf->intra_sf.chroma_intra_pruning_with_hog = 4; sf->intra_sf.intra_pruning_with_hog = 4; + sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC; + sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC; + sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC; + sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC; sf->part_sf.prune_rectangular_split_based_on_qidx = boosted || allow_screen_content_tools ? 
0 : 1; @@ -1000,10 +1101,10 @@ static void set_good_speed_features_framesize_independent( sf->mv_sf.simple_motion_subpel_force_stop = FULL_PEL; sf->mv_sf.use_bsize_dependent_search_method = 1; - sf->tpl_sf.disable_gop_length_decision = 1; + sf->tpl_sf.gop_length_decision_method = 3; sf->tpl_sf.disable_filtered_key_tpl = 1; - sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2; + sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 4; sf->tx_sf.use_intra_txb_hash = 1; sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 0; @@ -1052,10 +1153,13 @@ static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi, #endif } } else { - if (speed == 8 && !cpi->use_svc) { + if (speed == 8 && !cpi->ppi->use_svc) { sf->rt_sf.short_circuit_low_temp_var = 0; sf->rt_sf.use_nonrd_altref_frame = 1; } + if (speed >= 9) { + sf->rt_sf.skip_cdef_sb = 1; + } } if (!is_480p_or_larger) { if (speed == 7) { @@ -1088,6 +1192,8 @@ static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi, sf->part_sf.less_rectangular_check_level = 1; sf->part_sf.ml_prune_partition = 1; sf->part_sf.prune_ext_partition_types_search_level = 1; + sf->part_sf.reuse_prev_rd_results_for_part_ab = 1; + sf->part_sf.use_best_rd_for_pruning = 1; // TODO(debargha): Test, tweak and turn on either 1 or 2 sf->inter_sf.inter_mode_rd_model_estimation = 0; @@ -1103,6 +1209,7 @@ static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi, sf->interp_sf.use_fast_interpolation_filter_search = 1; + sf->intra_sf.dv_cost_upd_level = INTERNAL_COST_UPD_OFF; sf->intra_sf.intra_pruning_with_hog = 1; sf->mv_sf.full_pixel_search_level = 1; @@ -1140,7 +1247,6 @@ static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi, sf->inter_sf.prune_comp_search_by_single_result = 1; sf->inter_sf.reuse_inter_intra_mode = 1; sf->inter_sf.selective_ref_frame = 2; - sf->inter_sf.skip_repeated_newmv = 1; sf->inter_sf.disable_interintra_wedge_var_thresh = 0; 
sf->inter_sf.disable_interinter_wedge_var_thresh = 0; sf->inter_sf.prune_comp_type_by_comp_avg = 1; @@ -1191,7 +1297,7 @@ static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi, if (speed >= 3) { sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF; - sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH; + sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2; sf->part_sf.less_rectangular_check_level = 2; @@ -1202,7 +1308,7 @@ static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi, // sf->mv_sf.adaptive_motion_search = 1; sf->inter_sf.adaptive_rd_thresh = 2; - sf->inter_sf.mv_cost_upd_level = 1; + sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW; // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine // it with cpi->sf.disable_wedge_search_var_thresh. sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX; @@ -1306,12 +1412,20 @@ static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi, sf->part_sf.default_min_partition_size = BLOCK_8X8; sf->part_sf.partition_search_type = VAR_BASED_PARTITION; + sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH; + sf->mv_sf.search_method = FAST_DIAMOND; sf->mv_sf.subpel_force_stop = QUARTER_PEL; sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED; sf->inter_sf.inter_mode_rd_model_estimation = 2; + // Disable intra_y_mode_mask pruning since the performance at speed 7 isn't + // good. May need more study. + for (int i = 0; i < TX_SIZES; ++i) { + sf->intra_sf.intra_y_mode_mask[i] = INTRA_ALL; + } + sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q; sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH; @@ -1348,7 +1462,7 @@ static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi, // TODO(marpan): Look into why enabling skip_loopfilter_non_reference is // not bitexact on rtc testset, its very close (< ~0.01 bdrate), but not // always bitexact. 
- if (cpi->use_svc && cpi->svc.non_reference_frame && + if (cpi->ppi->use_svc && cpi->svc.non_reference_frame && sf->lpf_sf.cdef_pick_method == CDEF_PICK_FROM_Q && sf->lpf_sf.lpf_pick == LPF_PICK_FROM_Q) sf->rt_sf.skip_loopfilter_non_reference = 1; @@ -1398,8 +1512,14 @@ static AOM_INLINE void init_hl_sf(HIGH_LEVEL_SPEED_FEATURES *hl_sf) { hl_sf->second_alt_ref_filtering = 1; } +static AOM_INLINE void init_fp_sf(FIRST_PASS_SPEED_FEATURES *fp_sf) { + fp_sf->reduce_mv_step_param = 3; + fp_sf->skip_motion_search_threshold = 0; + fp_sf->disable_recon = 0; +} + static AOM_INLINE void init_tpl_sf(TPL_SPEED_FEATURES *tpl_sf) { - tpl_sf->disable_gop_length_decision = 0; + tpl_sf->gop_length_decision_method = 0; tpl_sf->prune_intra_modes = 0; tpl_sf->prune_starting_mv = 0; tpl_sf->reduce_first_step_size = 0; @@ -1415,6 +1535,7 @@ static AOM_INLINE void init_tpl_sf(TPL_SPEED_FEATURES *tpl_sf) { static AOM_INLINE void init_gm_sf(GLOBAL_MOTION_SPEED_FEATURES *gm_sf) { gm_sf->gm_search_type = GM_FULL_SEARCH; gm_sf->prune_ref_frame_for_gm_search = 0; + gm_sf->prune_zero_mv_with_sse = 0; } static AOM_INLINE void init_part_sf(PARTITION_SPEED_FEATURES *part_sf) { @@ -1454,6 +1575,9 @@ static AOM_INLINE void init_part_sf(PARTITION_SPEED_FEATURES *part_sf) { part_sf->ml_predict_breakout_level = 0; part_sf->prune_sub_8x8_partition_level = 0; part_sf->simple_motion_search_rect_split = 0; + part_sf->reuse_prev_rd_results_for_part_ab = 0; + part_sf->reuse_best_prediction_for_part_ab = 0; + part_sf->use_best_rd_for_pruning = 0; } static AOM_INLINE void init_mv_sf(MV_SPEED_FEATURES *mv_sf) { @@ -1487,16 +1611,17 @@ static AOM_INLINE void init_inter_sf(INTER_MODE_SPEED_FEATURES *inter_sf) { inter_sf->fast_wedge_sign_estimate = 0; inter_sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_ENABLED; inter_sf->reuse_inter_intra_mode = 0; - inter_sf->mv_cost_upd_level = 0; + inter_sf->mv_cost_upd_level = INTERNAL_COST_UPD_SB; + inter_sf->coeff_cost_upd_level = INTERNAL_COST_UPD_SB; + 
inter_sf->mode_cost_upd_level = INTERNAL_COST_UPD_SB; inter_sf->prune_inter_modes_based_on_tpl = 0; inter_sf->prune_nearmv_using_neighbors = 0; inter_sf->prune_comp_search_by_single_result = 0; inter_sf->skip_repeated_ref_mv = 0; - inter_sf->skip_repeated_newmv = 0; - inter_sf->skip_repeated_full_newmv = 0; + inter_sf->skip_newmv_in_drl = 0; inter_sf->inter_mode_rd_model_estimation = 0; inter_sf->prune_compound_using_single_ref = 0; - inter_sf->prune_compound_using_neighbors = 0; + inter_sf->prune_ext_comp_using_neighbors = 0; inter_sf->prune_comp_using_best_single_mode_ref = 0; inter_sf->disable_onesided_comp = 0; inter_sf->prune_mode_search_simple_translation = 0; @@ -1514,9 +1639,10 @@ static AOM_INLINE void init_inter_sf(INTER_MODE_SPEED_FEATURES *inter_sf) { inter_sf->txfm_rd_gate_level = 0; inter_sf->prune_inter_modes_if_skippable = 0; inter_sf->disable_masked_comp = 0; - inter_sf->reuse_best_prediction_for_part_ab = 0; inter_sf->enable_fast_compound_mode_search = 0; inter_sf->reuse_mask_search_results = 0; + inter_sf->enable_fast_wedge_mask_search = 0; + inter_sf->inter_mode_txfm_breakout = 0; } static AOM_INLINE void init_interp_sf(INTERP_FILTER_SPEED_FEATURES *interp_sf) { @@ -1529,6 +1655,7 @@ static AOM_INLINE void init_interp_sf(INTERP_FILTER_SPEED_FEATURES *interp_sf) { } static AOM_INLINE void init_intra_sf(INTRA_MODE_SPEED_FEATURES *intra_sf) { + intra_sf->dv_cost_upd_level = INTERNAL_COST_UPD_SB; intra_sf->skip_intra_in_interframe = 1; intra_sf->intra_pruning_with_hog = 0; intra_sf->chroma_intra_pruning_with_hog = 0; @@ -1539,8 +1666,10 @@ static AOM_INLINE void init_intra_sf(INTRA_MODE_SPEED_FEATURES *intra_sf) { intra_sf->intra_uv_mode_mask[i] = UV_INTRA_ALL; } intra_sf->disable_smooth_intra = 0; - intra_sf->disable_filter_intra = 0; + intra_sf->prune_filter_intra_level = 0; intra_sf->prune_chroma_modes_using_luma_winner = 0; + intra_sf->cfl_search_range = 3; + intra_sf->top_intra_model_count_allowed = TOP_INTRA_MODEL_COUNT; } static AOM_INLINE 
void init_tx_sf(TX_SPEED_FEATURES *tx_sf) { @@ -1650,9 +1779,11 @@ void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) { break; } - if (!cpi->seq_params_locked) { - cpi->common.seq_params.enable_masked_compound &= + if (!cpi->ppi->seq_params_locked) { + cpi->common.seq_params->enable_masked_compound &= !sf->inter_sf.disable_masked_comp; + cpi->common.seq_params->enable_interintra_compound &= + (sf->inter_sf.disable_interintra_wedge_var_thresh != UINT_MAX); } // This is only used in motion vector unit test. @@ -1662,9 +1793,9 @@ void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) { cpi->mv_search_params.find_fractional_mv_step = av1_return_min_sub_pixel_mv; if ((cpi->oxcf.row_mt == 1) && (cpi->oxcf.max_threads > 1)) { - if (sf->inter_sf.mv_cost_upd_level > 1) { + if (sf->inter_sf.mv_cost_upd_level < INTERNAL_COST_UPD_SBROW) { // Set mv_cost_upd_level to use row level update. - sf->inter_sf.mv_cost_upd_level = 1; + sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW; } } } @@ -1676,6 +1807,7 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) { int i; init_hl_sf(&sf->hl_sf); + init_fp_sf(&sf->fp_sf); init_tpl_sf(&sf->tpl_sf); init_gm_sf(&sf->gm_sf); init_part_sf(&sf->part_sf); @@ -1701,12 +1833,12 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) { break; } - if (!cpi->seq_params_locked) { - cpi->common.seq_params.enable_dual_filter &= + if (!cpi->ppi->seq_params_locked) { + cpi->common.seq_params->enable_dual_filter &= !sf->interp_sf.disable_dual_filter; - cpi->common.seq_params.enable_restoration &= !sf->lpf_sf.disable_lr_filter; + cpi->common.seq_params->enable_restoration &= !sf->lpf_sf.disable_lr_filter; - cpi->common.seq_params.enable_interintra_compound &= + cpi->common.seq_params->enable_interintra_compound &= (sf->inter_sf.disable_interintra_wedge_var_thresh != UINT_MAX); } @@ -1821,10 +1953,11 @@ void av1_set_speed_features_qindex_dependent(AV1_COMP 
*cpi, int speed) { SPEED_FEATURES *const sf = &cpi->sf; WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params; const int boosted = frame_is_boosted(cpi); + const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080; const int is_arf2_bwd_type = - cpi->gf_group.update_type[cpi->gf_group.index] == INTNL_ARF_UPDATE; + cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE; if (cpi->oxcf.mode == REALTIME) return; @@ -1832,7 +1965,6 @@ void av1_set_speed_features_qindex_dependent(AV1_COMP *cpi, int speed) { // qindex_thresh for resolution < 720p const int qindex_thresh = boosted ? 70 : (is_arf2_bwd_type ? 110 : 140); if (!is_720p_or_larger && cm->quant_params.base_qindex <= qindex_thresh) { - sf->inter_sf.skip_repeated_newmv = 1; sf->part_sf.simple_motion_search_split = cm->features.allow_screen_content_tools ? 1 : 2; sf->part_sf.simple_motion_search_early_term_none = 1; @@ -1849,7 +1981,6 @@ void av1_set_speed_features_qindex_dependent(AV1_COMP *cpi, int speed) { sf->tx_sf.inter_tx_size_search_init_depth_rect = 1; sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1; sf->tx_sf.intra_tx_size_search_init_depth_rect = 1; - sf->inter_sf.skip_repeated_newmv = 1; sf->tx_sf.model_based_prune_tx_search_level = 0; if (is_1080p_or_larger && cm->quant_params.base_qindex <= 108) { @@ -1866,28 +1997,25 @@ void av1_set_speed_features_qindex_dependent(AV1_COMP *cpi, int speed) { } } - if (speed >= 3) { - // Disable extended partitions for lower quantizers - const int qindex_thresh = - cm->features.allow_screen_content_tools ? 50 : 100; - if (cm->quant_params.base_qindex <= qindex_thresh && !boosted) { - sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; - } - } - - if (speed >= 4) { + if (speed >= 2) { // Disable extended partitions for lower quantizers - const int qindex_thresh = boosted ? 
80 : 120; - if (cm->quant_params.base_qindex <= qindex_thresh && - !frame_is_intra_only(&cpi->common)) { - sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; + const int aggr = AOMMIN(3, speed - 2); + const int qindex_thresh1[4] = { 50, 50, 80, 100 }; + const int qindex_thresh2[4] = { 80, 100, 120, 160 }; + int qindex_thresh; + int disable_ext_part; + if (aggr <= 1) { + const int qthresh2 = + (!aggr && !is_480p_or_larger) ? 70 : qindex_thresh2[aggr]; + qindex_thresh = cm->features.allow_screen_content_tools + ? qindex_thresh1[aggr] + : qthresh2; + disable_ext_part = !boosted; + } else { + qindex_thresh = boosted ? qindex_thresh1[aggr] : qindex_thresh2[aggr]; + disable_ext_part = !frame_is_intra_only(cm); } - } - - if (speed >= 5) { - const int qindex_thresh = boosted ? 100 : 160; - if (cm->quant_params.base_qindex <= qindex_thresh && - !frame_is_intra_only(&cpi->common)) { + if (cm->quant_params.base_qindex <= qindex_thresh && disable_ext_part) { sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; } } diff --git a/third_party/libaom/source/libaom/av1/encoder/speed_features.h b/third_party/libaom/source/libaom/av1/encoder/speed_features.h index 90765febfb..3cf4c3d10b 100644 --- a/third_party/libaom/source/libaom/av1/encoder/speed_features.h +++ b/third_party/libaom/source/libaom/av1/encoder/speed_features.h @@ -287,17 +287,30 @@ enum { SUPERRES_AUTO_DUAL, // Tries no superres and q-based superres ratios SUPERRES_AUTO_SOLO, // Only apply the q-based superres ratio } UENUM1BYTE(SUPERRES_AUTO_SEARCH_TYPE); - /*!\endcond */ + +/*!\enum INTERNAL_COST_UPDATE_TYPE + * \brief This enum decides internally how often to update the entropy costs + * + * INTERNAL_COST_UPD_TYPE is similar to \ref COST_UPDATE_TYPE but has slightly + * more flexibility in update frequency. This enum is separate from \ref + * COST_UPDATE_TYPE because although \ref COST_UPDATE_TYPE is not exposed, its + * values are public so it cannot be modified without breaking public API. 
+ */ +typedef enum { + INTERNAL_COST_UPD_OFF, /*!< Turn off cost updates. */ + INTERNAL_COST_UPD_SBROW_SET, /*!< Update every row_set of height 256 pixs. */ + INTERNAL_COST_UPD_SBROW, /*!< Update every sb rows inside a tile. */ + INTERNAL_COST_UPD_SB, /*!< Update every sb. */ +} INTERNAL_COST_UPDATE_TYPE; + /*! * \brief Sequence/frame level speed vs quality features */ typedef struct HIGH_LEVEL_SPEED_FEATURES { - /*!\cond */ - // Frame level coding parameter update + /*! Frame level coding parameter update. */ int frame_parameter_update; - /*!\endcond */ /*! * Cases and frame types for which the recode loop is enabled. */ @@ -309,25 +322,27 @@ typedef struct HIGH_LEVEL_SPEED_FEATURES { */ int recode_tolerance; - /*!\cond */ - // Determine how motion vector precision is chosen. The possibilities are: - // LAST_MV_DATA: use the mv data from the last coded frame - // CURRENT_Q: use the current q as a threshold - // QTR_ONLY: use quarter pel precision only. + /*! + * Determine how motion vector precision is chosen. The possibilities are: + * LAST_MV_DATA: use the mv data from the last coded frame + * CURRENT_Q: use the current q as a threshold + * QTR_ONLY: use quarter pel precision only. + */ MV_PREC_LOGIC high_precision_mv_usage; - // Always set to 0. If on it enables 0 cost background transmission - // (except for the initial transmission of the segmentation). The feature is - // disabled because the addition of very large block sizes make the - // backgrounds very to cheap to encode, and the segmentation we have - // adds overhead. + /*! + * Always set to 0. If on it enables 0 cost background transmission + * (except for the initial transmission of the segmentation). The feature is + * disabled because the addition of very large block sizes make the + * backgrounds very to cheap to encode, and the segmentation we have + * adds overhead. + */ int static_segmentation; /*! 
* Superres-auto mode search type: */ SUPERRES_AUTO_SEARCH_TYPE superres_auto_search_type; - /*!\endcond */ /*! * Enable/disable extra screen content test by encoding key frame twice. @@ -340,10 +355,39 @@ typedef struct HIGH_LEVEL_SPEED_FEATURES { int second_alt_ref_filtering; } HIGH_LEVEL_SPEED_FEATURES; +/*! + * Speed features for the first pass. + */ +typedef struct FIRST_PASS_SPEED_FEATURES { + /*! + * \brief Reduces the mv search window. + * By default, the initial search window is around + * MIN(MIN(dims), MAX_FULL_PEL_VAL) = MIN(MIN(dims), 1023). + * Each step reduction decrease the window size by about a factor of 2. + */ + int reduce_mv_step_param; + + /*! + * \brief Skips the motion search when the zero mv has small sse. + */ + int skip_motion_search_threshold; + + /*! + * \brief Skips reconstruction by using source buffers for prediction + */ + int disable_recon; +} FIRST_PASS_SPEED_FEATURES; + /*!\cond */ typedef struct TPL_SPEED_FEATURES { - // Enable/disable GOP length adaptive decision. - int disable_gop_length_decision; + // GOP length adaptive decision. + // If set to 0, tpl model decides whether a shorter gf interval is better. + // If set to 1, tpl stats of ARFs from base layer, (base+1) layer and + // (base+2) layer decide whether a shorter gf interval is better. + // If set to 2, tpl stats of ARFs from base layer, (base+1) layer and GF boost + // decide whether a shorter gf interval is better. + // If set to 3, gop length adaptive decision is disabled. + int gop_length_decision_method; // Prune the intra modes search by tpl. // If set to 0, we will search all intra modes from DC_PRED to PAETH_PRED. // If set to 1, we only search DC_PRED, V_PRED, and H_PRED. 
@@ -387,6 +431,10 @@ typedef struct GLOBAL_MOTION_SPEED_FEATURES { // given direction(past/future), if the evaluated ref_frame in that direction // yields gm_type as INVALID/TRANSLATION/IDENTITY int prune_ref_frame_for_gm_search; + + // When the current GM type is set to ZEROMV, prune ZEROMV if its performance + // is worse than NEWMV under SSE metric. + int prune_zero_mv_with_sse; } GLOBAL_MOTION_SPEED_FEATURES; typedef struct PARTITION_SPEED_FEATURES { @@ -511,6 +559,53 @@ typedef struct PARTITION_SPEED_FEATURES { // Prune rectangular split based on simple motion search split/no_split score. // 0: disable pruning, 1: enable pruning int simple_motion_search_rect_split; + + // The current encoder adopts a DFS search for block partitions. + // Therefore the mode selection and associated rdcost is ready for smaller + // blocks before the mode selection for some partition types. + // AB partition could use previous rd information and skip mode search. + // An example is: + // + // current block + // +---+---+ + // | | + // + + + // | | + // +-------+ + // + // SPLIT partition has been searched first before trying HORZ_A + // +---+---+ + // | R | R | + // +---+---+ + // | R | R | + // +---+---+ + // + // HORZ_A + // +---+---+ + // | | | + // +---+---+ + // | | + // +-------+ + // + // With this speed feature, the top two sub blocks can directly use rdcost + // searched in split partition, and the mode info is also copied from + // saved info. Similarly, the bottom rectangular block can also use + // the available information from previous rectangular search. + int reuse_prev_rd_results_for_part_ab; + + // Reuse the best prediction modes found in PARTITION_SPLIT and PARTITION_RECT + // when encoding PARTITION_AB. + int reuse_best_prediction_for_part_ab; + + // The current partition search records the best rdcost so far and uses it + // in mode search and transform search to early skip when some criteria is + // met. 
For example, when the current rdcost is larger than the best rdcost, + // or the model rdcost is larger than the best rdcost times some thresholds. + // By default, this feature is turned on to speed up the encoder partition + // search. + // If disabling it, at speed 0, 30 frames, we could get + // about -0.25% quality gain (psnr, ssim, vmaf), with about 13% slowdown. + int use_best_rd_for_pruning; } PARTITION_SPEED_FEATURES; typedef struct MV_SPEED_FEATURES { @@ -621,16 +716,19 @@ typedef struct INTER_MODE_SPEED_FEATURES { int alt_ref_search_fp; - // flag to skip NEWMV mode in drl if the motion search result is the same - int skip_repeated_newmv; - - // Skip the current ref_mv in NEW_MV mode if we have already encountered - // another ref_mv in the drl such that: - // 1. The other drl has the same fullpel_mv during the SIMPLE_TRANSLATION - // search process as the current fullpel_mv. - // 2. The rate needed to encode the current fullpel_mv is larger than that - // for the other ref_mv. - int skip_repeated_full_newmv; + // Skip the current ref_mv in NEW_MV mode based on mv, rate cost, etc. + // This speed feature equaling 0 means no skipping. + // If the speed feature equals 1 or 2, skip the current ref_mv in NEW_MV mode + // if we have already encountered ref_mv in the drl such that: + // 1. The other drl has the same mv during the SIMPLE_TRANSLATION search + // process as the current mv. + // 2. The rate needed to encode the current mv is larger than that for the + // other ref_mv. + // The speed feature equaling 1 means using subpel mv in the comparison. + // The speed feature equaling 2 means using fullpel mv in the comparison. + // If the speed feature >= 3, skip the current ref_mv in NEW_MV mode based on + // known full_mv bestsme and drl cost. 
+ int skip_newmv_in_drl; // This speed feature checks duplicate ref MVs among NEARESTMV, NEARMV, // GLOBALMV and skips NEARMV or GLOBALMV (in order) if a duplicate is found @@ -677,12 +775,14 @@ typedef struct INTER_MODE_SPEED_FEATURES { // the single reference modes, it is one of the two best performers. int prune_compound_using_single_ref; - // Skip extended compound mode using ref frames of above and left neighbor + // Skip extended compound mode (NEAREST_NEWMV, NEW_NEARESTMV, NEAR_NEWMV, + // NEW_NEARMV) using ref frames of above and left neighbor // blocks. // 0 : no pruning - // 1 : prune extended compound mode (less aggressiveness) - // 2 : prune extended compound mode (high aggressiveness) - int prune_compound_using_neighbors; + // 1 : prune ext compound modes using neighbor blocks (less aggressiveness) + // 2 : prune ext compound modes using neighbor blocks (high aggressiveness) + // 3 : prune ext compound modes unconditionally (highest aggressiveness) + int prune_ext_comp_using_neighbors; // Skip extended compound mode when ref frame corresponding to NEWMV does not // have NEWMV as single mode winner. @@ -722,12 +822,15 @@ typedef struct INTER_MODE_SPEED_FEATURES { // Decide when and how to use joint_comp. DIST_WTD_COMP_FLAG use_dist_wtd_comp_flag; - // To skip cost update for mv. - // mv_cost_upd_level indicates the aggressiveness of skipping. - // 0: update happens at each sb level. - // 1: update happens once for each sb row. - // 2: update happens once for a set of rows. - int mv_cost_upd_level; + // Clip the frequency of updating the mv cost. + INTERNAL_COST_UPDATE_TYPE mv_cost_upd_level; + + // Clip the frequency of updating the coeff cost. + INTERNAL_COST_UPDATE_TYPE coeff_cost_upd_level; + + // Clip the frequency of updating the mode cost. + INTERNAL_COST_UPDATE_TYPE mode_cost_upd_level; + // Prune inter modes based on tpl stats // 0 : no pruning // 1 - 3 indicate increasing aggressiveness in order. 
@@ -750,15 +853,17 @@ typedef struct INTER_MODE_SPEED_FEATURES { // Enable/disable masked compound. int disable_masked_comp; - // Reuse the best prediction modes found in PARTITION_SPLIT and PARTITION_RECT - // when encoding PARTITION_AB. - int reuse_best_prediction_for_part_ab; - // Enable/disable the fast compound mode search. int enable_fast_compound_mode_search; // Reuse masked compound type search results int reuse_mask_search_results; + + // Enable/disable fast search for wedge masks + int enable_fast_wedge_mask_search; + + // Early breakout from transform search of inter modes + int inter_mode_txfm_breakout; } INTER_MODE_SPEED_FEATURES; typedef struct INTERP_FILTER_SPEED_FEATURES { @@ -808,8 +913,11 @@ typedef struct INTRA_MODE_SPEED_FEATURES { // Enable/disable smooth intra modes. int disable_smooth_intra; - // Enable/disable filter intra modes. - int disable_filter_intra; + // Prune filter intra modes in intra frames. + // 0 : No pruning + // 1 : Evaluate applicable filter intra modes based on best intra mode so far + // 2 : Do not evaluate filter intra modes + int prune_filter_intra_level; // prune palette search // 0: No pruning @@ -825,6 +933,27 @@ typedef struct INTRA_MODE_SPEED_FEATURES { // 1: Prune chroma intra modes other than UV_DC_PRED, UV_SMOOTH_PRED, // UV_CFL_PRED and the mode that corresponds to luma intra mode winner. int prune_chroma_modes_using_luma_winner; + + // Clip the frequency of updating the mv cost for intrabc. + INTERNAL_COST_UPDATE_TYPE dv_cost_upd_level; + + // We use DCT_DCT transform followed by computing SATD (Sum of Absolute + // Transformed Differences) as an estimation of RD score to quickly find the + // best possible Chroma from Luma (CFL) parameter. Then we do a full RD search + // near the best possible parameter. The search range is set here. + // The range of cfl_searh_range should be [1, 33], and the following are the + // recommended values. + // 1: Fastest mode. 
+ // 3: Default mode that provides good speedup without losing compression + // performance at speed 0. + // 33: Exhaustive rd search (33 == CFL_MAGS_SIZE). This mode should only + // be used for debugging purpose. + int cfl_search_range; + + // TOP_INTRA_MODEL_COUNT is 4 that is the number of top model rd to store in + // intra mode decision. Here, add a speed feature to reduce this number for + // higher speeds. + int top_intra_model_count_allowed; } INTRA_MODE_SPEED_FEATURES; typedef struct TX_SPEED_FEATURES { @@ -1082,6 +1211,11 @@ typedef struct REAL_TIME_SPEED_FEATURES { // Skips mode checks more agressively in nonRD mode int nonrd_agressive_skip; + + // Skip cdef on 64x64 blocks when NEWMV or INTRA is not picked or color + // sensitivity is off. When color sensitivity is on for a superblock, all + // 64x64 blocks within will not skip. + int skip_cdef_sb; } REAL_TIME_SPEED_FEATURES; /*!\endcond */ @@ -1096,6 +1230,11 @@ typedef struct SPEED_FEATURES { HIGH_LEVEL_SPEED_FEATURES hl_sf; /*! + * Speed features for the first pass. + */ + FIRST_PASS_SPEED_FEATURES fp_sf; + + /*! * Speed features related to how tpl's searches are done. 
*/ TPL_SPEED_FEATURES tpl_sf; diff --git a/third_party/libaom/source/libaom/av1/encoder/superres_scale.c b/third_party/libaom/source/libaom/av1/encoder/superres_scale.c index bcd3fefdfe..283faabe61 100644 --- a/third_party/libaom/source/libaom/av1/encoder/superres_scale.c +++ b/third_party/libaom/source/libaom/av1/encoder/superres_scale.c @@ -80,7 +80,7 @@ static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) { if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR; uint8_t new_denom = SCALE_NUMERATOR; - if (cpi->common.seq_params.reduced_still_picture_hdr) return SCALE_NUMERATOR; + if (cpi->common.seq_params->reduced_still_picture_hdr) return SCALE_NUMERATOR; switch (resize_cfg->resize_mode) { case RESIZE_NONE: new_denom = SCALE_NUMERATOR; break; case RESIZE_FIXED: @@ -109,12 +109,13 @@ int av1_superres_in_recode_allowed(const AV1_COMP *const cpi) { #define SUPERRES_ENERGY_BY_AC_THRESH 0.2 static double get_energy_by_q2_thresh(const GF_GROUP *gf_group, - const RATE_CONTROL *rc) { + const RATE_CONTROL *rc, + int gf_frame_index) { // TODO(now): Return keyframe thresh * factor based on frame type / pyramid // level. - if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { + if (gf_group->update_type[gf_frame_index] == ARF_UPDATE) { return SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME; - } else if (gf_group->update_type[gf_group->index] == KF_UPDATE) { + } else if (gf_group->update_type[gf_frame_index] == KF_UPDATE) { if (rc->frames_to_key <= 1) return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO; else @@ -142,15 +143,15 @@ static uint8_t get_superres_denom_from_qindex_energy(int qindex, double *energy, static uint8_t get_superres_denom_for_qindex(const AV1_COMP *cpi, int qindex, int sr_kf, int sr_arf) { // Use superres for Key-frames and Alt-ref frames only. 
- const GF_GROUP *gf_group = &cpi->gf_group; - if (gf_group->update_type[gf_group->index] != KF_UPDATE && - gf_group->update_type[gf_group->index] != ARF_UPDATE) { + const GF_GROUP *gf_group = &cpi->ppi->gf_group; + if (gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE && + gf_group->update_type[cpi->gf_frame_index] != ARF_UPDATE) { return SCALE_NUMERATOR; } - if (gf_group->update_type[gf_group->index] == KF_UPDATE && !sr_kf) { + if (gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE && !sr_kf) { return SCALE_NUMERATOR; } - if (gf_group->update_type[gf_group->index] == ARF_UPDATE && !sr_arf) { + if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE && !sr_arf) { return SCALE_NUMERATOR; } @@ -158,7 +159,7 @@ static uint8_t get_superres_denom_for_qindex(const AV1_COMP *cpi, int qindex, analyze_hor_freq(cpi, energy); const double energy_by_q2_thresh = - get_energy_by_q2_thresh(gf_group, &cpi->rc); + get_energy_by_q2_thresh(gf_group, &cpi->rc, cpi->gf_frame_index); int denom = get_superres_denom_from_qindex_energy( qindex, energy, energy_by_q2_thresh, SUPERRES_ENERGY_BY_AC_THRESH); /* @@ -166,8 +167,8 @@ static uint8_t get_superres_denom_for_qindex(const AV1_COMP *cpi, int qindex, for (int k = 1; k < 16; ++k) printf("%f, ", energy[k]); printf("]\n"); printf("boost = %d\n", - (gf_group->update_type[gf_group->index] == KF_UPDATE) - ? cpi->rc.kf_boost + (gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE) + ? cpi->ppi->p_rc.kf_boost : cpi->rc.gfu_boost); printf("denom = %d\n", denom); */ @@ -194,8 +195,8 @@ static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) { // Make sure that superres mode of the frame is consistent with the // sequence-level flag. 
assert(IMPLIES(superres_cfg->superres_mode != AOM_SUPERRES_NONE, - cpi->common.seq_params.enable_superres)); - assert(IMPLIES(!cpi->common.seq_params.enable_superres, + cpi->common.seq_params->enable_superres)); + assert(IMPLIES(!cpi->common.seq_params->enable_superres, superres_cfg->superres_mode == AOM_SUPERRES_NONE)); // Make sure that superres mode for current encoding is consistent with user // provided superres mode. @@ -222,8 +223,8 @@ static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) { // Now decide the use of superres based on 'q'. int bottom_index, top_index; const int q = av1_rc_pick_q_and_bounds( - cpi, &cpi->rc, frm_dim_cfg->width, frm_dim_cfg->height, - cpi->gf_group.index, &bottom_index, &top_index); + cpi, frm_dim_cfg->width, frm_dim_cfg->height, cpi->gf_frame_index, + &bottom_index, &top_index); const int qthresh = (frame_is_intra_only(&cpi->common)) ? superres_cfg->superres_kf_qthresh @@ -243,8 +244,8 @@ static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) { // Now decide the use of superres based on 'q'. 
int bottom_index, top_index; const int q = av1_rc_pick_q_and_bounds( - cpi, &cpi->rc, frm_dim_cfg->width, frm_dim_cfg->height, - cpi->gf_group.index, &bottom_index, &top_index); + cpi, frm_dim_cfg->width, frm_dim_cfg->height, cpi->gf_frame_index, + &bottom_index, &top_index); const SUPERRES_AUTO_SEARCH_TYPE sr_search_type = cpi->sf.hl_sf.superres_auto_search_type; @@ -345,7 +346,7 @@ static size_params_type calculate_next_size_params(AV1_COMP *cpi) { size_params_type rsz = { frm_dim_cfg->width, frm_dim_cfg->height, SCALE_NUMERATOR }; int resize_denom = SCALE_NUMERATOR; - if (has_no_stats_stage(cpi) && cpi->use_svc && + if (has_no_stats_stage(cpi) && cpi->ppi->use_svc && cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) { rsz.resize_width = cpi->common.width; rsz.resize_height = cpi->common.height; diff --git a/third_party/libaom/source/libaom/av1/encoder/svc_layercontext.c b/third_party/libaom/source/libaom/av1/encoder/svc_layercontext.c index 17109201e6..5cff958a85 100644 --- a/third_party/libaom/source/libaom/av1/encoder/svc_layercontext.c +++ b/third_party/libaom/source/libaom/av1/encoder/svc_layercontext.c @@ -30,6 +30,7 @@ void av1_init_layer_context(AV1_COMP *const cpi) { svc->current_superframe = 0; svc->force_zero_mode_spatial_ref = 1; svc->num_encoded_top_layer = 0; + svc->use_flexible_mode = 0; for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { @@ -90,6 +91,7 @@ void av1_init_layer_context(AV1_COMP *const cpi) { void av1_update_layer_context_change_config(AV1_COMP *const cpi, const int64_t target_bandwidth) { const RATE_CONTROL *const rc = &cpi->rc; + const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; SVC *const svc = &cpi->svc; int layer = 0; int64_t spatial_layer_target = 0; @@ -106,17 +108,18 @@ void av1_update_layer_context_change_config(AV1_COMP *const cpi, LAYER_CONTEXT *const lc = &svc->layer_context[sl * svc->number_temporal_layers + tl]; RATE_CONTROL *const lrc 
= &lc->rc; + PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc; lc->spatial_layer_target_bandwidth = spatial_layer_target; bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; - lrc->starting_buffer_level = - (int64_t)(rc->starting_buffer_level * bitrate_alloc); - lrc->optimal_buffer_level = - (int64_t)(rc->optimal_buffer_level * bitrate_alloc); - lrc->maximum_buffer_size = - (int64_t)(rc->maximum_buffer_size * bitrate_alloc); + lp_rc->starting_buffer_level = + (int64_t)(p_rc->starting_buffer_level * bitrate_alloc); + lp_rc->optimal_buffer_level = + (int64_t)(p_rc->optimal_buffer_level * bitrate_alloc); + lp_rc->maximum_buffer_size = + (int64_t)(p_rc->maximum_buffer_size * bitrate_alloc); lrc->bits_off_target = - AOMMIN(lrc->bits_off_target, lrc->maximum_buffer_size); - lrc->buffer_level = AOMMIN(lrc->buffer_level, lrc->maximum_buffer_size); + AOMMIN(lrc->bits_off_target, lp_rc->maximum_buffer_size); + lrc->buffer_level = AOMMIN(lrc->buffer_level, lp_rc->maximum_buffer_size); lc->framerate = cpi->framerate / lc->framerate_factor; lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); lrc->max_frame_bandwidth = rc->max_frame_bandwidth; @@ -164,7 +167,6 @@ void av1_update_temporal_layer_framerate(AV1_COMP *const cpi) { } void av1_restore_layer_context(AV1_COMP *const cpi) { - GF_GROUP *const gf_group = &cpi->gf_group; SVC *const svc = &cpi->svc; const AV1_COMMON *const cm = &cpi->common; LAYER_CONTEXT *const lc = get_layer_context(cpi); @@ -172,8 +174,9 @@ void av1_restore_layer_context(AV1_COMP *const cpi) { const int old_frame_to_key = cpi->rc.frames_to_key; // Restore layer rate control. 
cpi->rc = lc->rc; + cpi->ppi->p_rc = lc->p_rc; cpi->oxcf.rc_cfg.target_bandwidth = lc->target_bandwidth; - gf_group->index = 0; + cpi->gf_frame_index = 0; cpi->mv_search_params.max_mv_magnitude = lc->max_mv_magnitude; if (cpi->mv_search_params.max_mv_magnitude == 0) cpi->mv_search_params.max_mv_magnitude = AOMMAX(cm->width, cm->height); @@ -198,7 +201,7 @@ void av1_restore_layer_context(AV1_COMP *const cpi) { // This is to skip searching mv for that reference if it was last // refreshed (i.e., buffer slot holding that reference was refreshed) on the // previous spatial layer(s) at the same time (current_superframe). - if (svc->external_ref_frame_config && svc->force_zero_mode_spatial_ref) { + if (svc->set_ref_frame_config && svc->force_zero_mode_spatial_ref) { int ref_frame_idx = svc->ref_idx[LAST_FRAME - 1]; if (svc->buffer_time_index[ref_frame_idx] == svc->current_superframe && svc->buffer_spatial_layer[ref_frame_idx] <= svc->spatial_layer_id - 1) @@ -211,13 +214,13 @@ void av1_restore_layer_context(AV1_COMP *const cpi) { } void av1_save_layer_context(AV1_COMP *const cpi) { - GF_GROUP *const gf_group = &cpi->gf_group; SVC *const svc = &cpi->svc; const AV1_COMMON *const cm = &cpi->common; LAYER_CONTEXT *lc = get_layer_context(cpi); lc->rc = cpi->rc; + lc->p_rc = cpi->ppi->p_rc; lc->target_bandwidth = (int)cpi->oxcf.rc_cfg.target_bandwidth; - lc->group_index = gf_group->index; + lc->group_index = cpi->gf_frame_index; lc->max_mv_magnitude = cpi->mv_search_params.max_mv_magnitude; if (svc->spatial_layer_id == 0) svc->base_framerate = cpi->framerate; // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers, @@ -243,7 +246,7 @@ void av1_save_layer_context(AV1_COMP *const cpi) { svc->buffer_time_index[i] = svc->current_superframe; svc->buffer_spatial_layer[i] = svc->spatial_layer_id; } - } else if (cpi->svc.external_ref_frame_config) { + } else if (cpi->svc.set_ref_frame_config) { for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) { int 
ref_frame_map_idx = svc->ref_idx[i]; if (cpi->svc.refresh[ref_frame_map_idx]) { @@ -342,3 +345,171 @@ void av1_one_pass_cbr_svc_start_layer(AV1_COMP *const cpi) { cpi->common.height = height; av1_update_frame_size(cpi); } + +enum { + SVC_LAST_FRAME = 0, + SVC_LAST2_FRAME, + SVC_LAST3_FRAME, + SVC_GOLDEN_FRAME, + SVC_BWDREF_FRAME, + SVC_ALTREF2_FRAME, + SVC_ALTREF_FRAME +}; + +// For fixed svc mode: fixed pattern is set based on the number of +// spatial and temporal layers, and the ksvc_fixed_mode. +void av1_set_svc_fixed_mode(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + int i; + assert(svc->use_flexible_mode == 0); + // Fixed SVC mode only supports at most 3 spatial or temporal layers. + assert(svc->number_spatial_layers >= 1 && svc->number_spatial_layers <= 3 && + svc->number_temporal_layers >= 1 && svc->number_temporal_layers <= 3); + svc->set_ref_frame_config = 1; + int superframe_cnt = svc->current_superframe; + // Set the reference map buffer idx for the 7 references: + // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3), + // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = i; + for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->reference[i] = 0; + for (i = 0; i < REF_FRAMES; i++) svc->refresh[i] = 0; + // Always reference LAST, and reference GOLDEN on SL > 0. + // For KSVC: GOLDEN reference will be removed on INTER_FRAMES later + // when frame_type is set. + svc->reference[SVC_LAST_FRAME] = 1; + if (svc->spatial_layer_id > 0) svc->reference[SVC_GOLDEN_FRAME] = 1; + if (svc->temporal_layer_id == 0) { + // Base temporal layer. + if (svc->spatial_layer_id == 0) { + // Set all buffer_idx to 0. Update slot 0 (LAST). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0; + svc->refresh[0] = 1; + } else if (svc->spatial_layer_id == 1) { + // Set buffer_idx for LAST to slot 1, GOLDEN (and all other refs) to + // slot 0. Update slot 1 (LAST). 
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0; + svc->ref_idx[SVC_LAST_FRAME] = 1; + svc->refresh[1] = 1; + } else if (svc->spatial_layer_id == 2) { + // Set buffer_idx for LAST to slot 2, GOLDEN (and all other refs) to + // slot 1. Update slot 2 (LAST). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 1; + svc->ref_idx[SVC_LAST_FRAME] = 2; + svc->refresh[2] = 1; + } + } else if (svc->temporal_layer_id == 2 && (superframe_cnt - 1) % 4 == 0) { + // First top temporal enhancement layer. + if (svc->spatial_layer_id == 0) { + // Reference LAST (slot 0). + // Set GOLDEN to slot 3 and update slot 3. + // Set all other buffer_idx to slot 0. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0; + if (svc->spatial_layer_id < svc->number_spatial_layers - 1) { + svc->ref_idx[SVC_GOLDEN_FRAME] = 3; + svc->refresh[3] = 1; + } + } else if (svc->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 3. + // Set LAST2 to slot 4 and Update slot 4. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 3; + svc->ref_idx[SVC_LAST_FRAME] = 1; + if (svc->spatial_layer_id < svc->number_spatial_layers - 1) { + svc->ref_idx[SVC_LAST2_FRAME] = 4; + svc->refresh[4] = 1; + } + } else if (svc->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, + // GOLDEN (and all other refs) to slot 4. + // No update. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 4; + svc->ref_idx[SVC_LAST_FRAME] = 2; + } + } else if (svc->temporal_layer_id == 1) { + // Middle temporal enhancement layer. + if (svc->spatial_layer_id == 0) { + // Reference LAST. + // Set all buffer_idx to 0. + // Set GOLDEN to slot 5 and update slot 5. 
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0; + if (svc->temporal_layer_id < svc->number_temporal_layers - 1) { + svc->ref_idx[SVC_GOLDEN_FRAME] = 5; + svc->refresh[5] = 1; + } + } else if (svc->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 5. + // Set LAST3 to slot 6 and update slot 6. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 5; + svc->ref_idx[SVC_LAST_FRAME] = 1; + if (svc->temporal_layer_id < svc->number_temporal_layers - 1) { + svc->ref_idx[SVC_LAST3_FRAME] = 6; + svc->refresh[6] = 1; + } + } else if (svc->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, + // GOLDEN (and all other refs) to slot 6. + // Set LAST3 to slot 7 and update slot 7. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 6; + svc->ref_idx[SVC_LAST_FRAME] = 2; + if (svc->temporal_layer_id < svc->number_temporal_layers - 1) { + svc->ref_idx[SVC_LAST3_FRAME] = 7; + svc->refresh[7] = 1; + } + } + } else if (svc->temporal_layer_id == 2 && (superframe_cnt - 3) % 4 == 0) { + // Second top temporal enhancement layer. + if (svc->spatial_layer_id == 0) { + // Set LAST to slot 5 and reference LAST. + // Set GOLDEN to slot 3 and update slot 3. + // Set all other buffer_idx to 0. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0; + svc->ref_idx[SVC_LAST_FRAME] = 5; + if (svc->spatial_layer_id < svc->number_spatial_layers - 1) { + svc->ref_idx[SVC_GOLDEN_FRAME] = 3; + svc->refresh[3] = 1; + } + } else if (svc->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6, + // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4. 
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0; + svc->ref_idx[SVC_LAST_FRAME] = 6; + svc->ref_idx[SVC_GOLDEN_FRAME] = 3; + if (svc->spatial_layer_id < svc->number_spatial_layers - 1) { + svc->ref_idx[SVC_LAST2_FRAME] = 4; + svc->refresh[4] = 1; + } + } else if (svc->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 7, + // GOLDEN to slot 4. No update. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = 0; + svc->ref_idx[SVC_LAST_FRAME] = 7; + svc->ref_idx[SVC_GOLDEN_FRAME] = 4; + } + } +} + +void av1_svc_check_reset_layer_rc_flag(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { + // Check for reset based on avg_frame_bandwidth for spatial layer sl. + int layer = LAYER_IDS_TO_IDX(sl, svc->number_temporal_layers - 1, + svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + if (lrc->avg_frame_bandwidth > (3 * lrc->prev_avg_frame_bandwidth >> 1) || + lrc->avg_frame_bandwidth < (lrc->prev_avg_frame_bandwidth >> 1)) { + // Reset for all temporal layers with spatial layer sl. 
+ for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { + int layer2 = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *lc2 = &svc->layer_context[layer2]; + RATE_CONTROL *lrc2 = &lc2->rc; + PRIMARY_RATE_CONTROL *const lp_rc = &lc2->p_rc; + lrc2->rc_1_frame = 0; + lrc2->rc_2_frame = 0; + lrc2->bits_off_target = lp_rc->optimal_buffer_level; + lrc2->buffer_level = lp_rc->optimal_buffer_level; + } + } + } +} diff --git a/third_party/libaom/source/libaom/av1/encoder/svc_layercontext.h b/third_party/libaom/source/libaom/av1/encoder/svc_layercontext.h index 1eeba5e273..817e3620b0 100644 --- a/third_party/libaom/source/libaom/av1/encoder/svc_layercontext.h +++ b/third_party/libaom/source/libaom/av1/encoder/svc_layercontext.h @@ -26,6 +26,7 @@ extern "C" { typedef struct { /*!\cond */ RATE_CONTROL rc; + PRIMARY_RATE_CONTROL p_rc; int framerate_factor; int64_t layer_target_bitrate; int scaling_factor_num; @@ -94,8 +95,10 @@ typedef struct SVC { int temporal_layer_id; int number_spatial_layers; int number_temporal_layers; - int external_ref_frame_config; + int set_ref_frame_config; int non_reference_frame; + int use_flexible_mode; + int ksvc_fixed_mode; /*!\endcond */ /*! 
@@ -271,6 +274,11 @@ int av1_svc_primary_ref_frame(const struct AV1_COMP *const cpi); void av1_get_layer_resolution(const int width_org, const int height_org, const int num, const int den, int *width_out, int *height_out); + +void av1_set_svc_fixed_mode(struct AV1_COMP *const cpi); + +void av1_svc_check_reset_layer_rc_flag(struct AV1_COMP *const cpi); + #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/libaom/source/libaom/av1/encoder/temporal_filter.c b/third_party/libaom/source/libaom/av1/encoder/temporal_filter.c index 676e110e60..6833ac8a40 100644 --- a/third_party/libaom/source/libaom/av1/encoder/temporal_filter.c +++ b/third_party/libaom/source/libaom/av1/encoder/temporal_filter.c @@ -155,7 +155,7 @@ static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb, best_mv.as_mv.row = GET_MV_SUBPEL(mv_row); best_mv.as_mv.col = GET_MV_SUBPEL(mv_col); const int mv_offset = mv_row * y_stride + mv_col; - error = cpi->fn_ptr[block_size].vf( + error = cpi->ppi->fn_ptr[block_size].vf( ref_frame->y_buffer + y_offset + mv_offset, y_stride, frame_to_filter->y_buffer + y_offset, y_stride, &sse); block_mse = DIVIDE_AND_ROUND(error, mb_pels); @@ -561,9 +561,16 @@ void av1_apply_temporal_filter_c( (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; // Decay factors for non-local mean approach. double decay_factor[MAX_MB_PLANE] = { 0 }; - // Smaller q -> smaller filtering weight. + // Adjust filtering based on q. + // Larger q -> stronger filtering -> larger weight. + // Smaller q -> weaker filtering -> smaller weight. double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); q_decay = CLIP(q_decay, 1e-5, 1); + if (q_factor >= TF_QINDEX_CUTOFF) { + // Max q_factor is 255, therefore the upper bound of q_decay is 8. + // We do not need a clip here. + q_decay = 0.5 * pow((double)q_factor / 64, 2); + } // Smaller strength -> smaller filtering weight. 
double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); s_decay = CLIP(s_decay, 1e-5, 1); @@ -745,10 +752,19 @@ static void tf_normalize_filtered_frame( } int av1_get_q(const AV1_COMP *cpi) { - const GF_GROUP *gf_group = &cpi->gf_group; - const FRAME_TYPE frame_type = gf_group->frame_type[gf_group->index]; - const int q = (int)av1_convert_qindex_to_q( - cpi->rc.avg_frame_qindex[frame_type], cpi->common.seq_params.bit_depth); + const GF_GROUP *gf_group = &cpi->ppi->gf_group; + const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index]; + int avg_frame_qindex; +#if CONFIG_FRAME_PARALLEL_ENCODE + avg_frame_qindex = + (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) + ? cpi->ppi->temp_avg_frame_qindex[frame_type] + : cpi->rc.avg_frame_qindex[frame_type]; +#else + avg_frame_qindex = cpi->rc.avg_frame_qindex[frame_type]; +#endif // CONFIG_FRAME_PARALLEL_ENCODE + const int q = (int)av1_convert_qindex_to_q(avg_frame_qindex, + cpi->common.seq_params->bit_depth); return q; } @@ -855,23 +871,24 @@ void av1_tf_do_filtering_row(AV1_COMP *cpi, ThreadData *td, int mb_row) { } } tf_normalize_filtered_frame(mbd, block_size, mb_row, mb_col, num_planes, - accum, count, &cpi->alt_ref_buffer); + accum, count, &cpi->ppi->alt_ref_buffer); if (check_show_existing) { const int y_height = mb_height >> mbd->plane[0].subsampling_y; const int y_width = mb_width >> mbd->plane[0].subsampling_x; const int source_y_stride = frame_to_filter->y_stride; - const int filter_y_stride = cpi->alt_ref_buffer.y_stride; + const int filter_y_stride = cpi->ppi->alt_ref_buffer.y_stride; const int source_offset = mb_row * y_height * source_y_stride + mb_col * y_width; const int filter_offset = mb_row * y_height * filter_y_stride + mb_col * y_width; unsigned int sse = 0; - cpi->fn_ptr[block_size].vf( + cpi->ppi->fn_ptr[block_size].vf( frame_to_filter->y_buffer + source_offset, source_y_stride, - cpi->alt_ref_buffer.y_buffer + filter_offset, filter_y_stride, &sse); 
+ cpi->ppi->alt_ref_buffer.y_buffer + filter_offset, filter_y_stride, + &sse); diff->sum += sse; - diff->sse += sse * sse; + diff->sse += sse * (int64_t)sse; } } } @@ -939,8 +956,9 @@ static void tf_setup_filtering_buffer(AV1_COMP *cpi, const int lookahead_depth = av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage); - int arf_src_offset = cpi->gf_group.arf_src_offset[cpi->gf_group.index]; - const FRAME_TYPE frame_type = cpi->gf_group.frame_type[cpi->gf_group.index]; + int arf_src_offset = cpi->ppi->gf_group.arf_src_offset[cpi->gf_frame_index]; + const FRAME_TYPE frame_type = + cpi->ppi->gf_group.frame_type[cpi->gf_frame_index]; // Temporal filtering should not go beyond key frames const int key_to_curframe = @@ -949,10 +967,10 @@ static void tf_setup_filtering_buffer(AV1_COMP *cpi, AOMMAX(cpi->rc.frames_to_key - arf_src_offset - 1, 0); // Number of buffered frames before the to-filter frame. - const int max_before = AOMMIN(filter_frame_lookahead_idx, key_to_curframe); + int max_before = AOMMIN(filter_frame_lookahead_idx, key_to_curframe); // Number of buffered frames after the to-filter frame. - const int max_after = + int max_after = AOMMIN(lookahead_depth - filter_frame_lookahead_idx - 1, curframe_to_key); // Estimate noises for each plane. @@ -964,26 +982,34 @@ static void tf_setup_filtering_buffer(AV1_COMP *cpi, double *noise_levels = tf_ctx->noise_levels; for (int plane = 0; plane < num_planes; ++plane) { noise_levels[plane] = av1_estimate_noise_from_single_plane( - to_filter_frame, plane, cpi->common.seq_params.bit_depth); + to_filter_frame, plane, cpi->common.seq_params->bit_depth); } // Get quantization factor. 
const int q = av1_get_q(cpi); - // Get correlation estimates from first-pass - RATE_CONTROL *rc = &cpi->rc; - const double *coeff = rc->cor_coeff; - const int offset = rc->regions_offset; - int cur_frame_idx = - filter_frame_lookahead_idx + rc->frames_since_key - offset; - + // Get correlation estimates from first-pass; + const FIRSTPASS_STATS *stats = + cpi->ppi->twopass.stats_in - (cpi->rc.frames_since_key == 0); double accu_coeff0 = 1.0, accu_coeff1 = 1.0; for (int i = 1; i <= max_after; i++) { - accu_coeff1 *= coeff[cur_frame_idx + i]; + if (stats + filter_frame_lookahead_idx + i >= + cpi->ppi->twopass.stats_buf_ctx->stats_in_end) { + max_after = i - 1; + break; + } + accu_coeff1 *= + AOMMAX(stats[filter_frame_lookahead_idx + i].cor_coeff, 0.001); } if (max_after >= 1) { accu_coeff1 = pow(accu_coeff1, 1.0 / (double)max_after); } for (int i = 1; i <= max_before; i++) { - accu_coeff0 *= coeff[cur_frame_idx - i + 1]; + if (stats + filter_frame_lookahead_idx - i + 1 <= + cpi->ppi->twopass.stats_buf_ctx->stats_in_start) { + max_before = i - 1; + break; + } + accu_coeff0 *= + AOMMAX(stats[filter_frame_lookahead_idx - i + 1].cor_coeff, 0.001); } if (max_before >= 1) { accu_coeff0 = pow(accu_coeff0, 1.0 / (double)max_before); @@ -1008,7 +1034,7 @@ static void tf_setup_filtering_buffer(AV1_COMP *cpi, num_before = AOMMIN(num_frames - 1, max_before); num_after = 0; } else { - num_frames = AOMMIN(num_frames, cpi->rc.gfu_boost / 150); + num_frames = AOMMIN(num_frames, cpi->ppi->p_rc.gfu_boost / 150); num_frames += !(num_frames & 1); // Make the number odd. // Only use 2 neighbours for the second ARF. 
if (is_second_arf) num_frames = AOMMIN(num_frames, 3); @@ -1051,10 +1077,10 @@ static void tf_setup_filtering_buffer(AV1_COMP *cpi, assert(frames[tf_ctx->filter_frame_idx] == to_filter_frame); av1_setup_src_planes(&cpi->td.mb, &to_filter_buf->img, 0, 0, num_planes, - cpi->common.seq_params.sb_size); + cpi->common.seq_params->sb_size); av1_setup_block_planes(&cpi->td.mb.e_mbd, - cpi->common.seq_params.subsampling_x, - cpi->common.seq_params.subsampling_y, num_planes); + cpi->common.seq_params->subsampling_x, + cpi->common.seq_params->subsampling_y, num_planes); } /*!\cond */ @@ -1174,8 +1200,8 @@ int av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx, int *show_existing_arf) { MultiThreadInfo *const mt_info = &cpi->mt_info; // Basic informaton of the current frame. - const GF_GROUP *const gf_group = &cpi->gf_group; - const uint8_t group_idx = gf_group->index; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; + const uint8_t group_idx = cpi->gf_frame_index; TemporalFilterCtx *tf_ctx = &cpi->tf_ctx; TemporalFilterData *tf_data = &cpi->td.tf_data; // Filter one more ARF if the lookahead index is leq 7 (w.r.t. 9-th frame). 
@@ -1236,9 +1262,9 @@ int av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx, int top_index = 0; int bottom_index = 0; const int q = av1_rc_pick_q_and_bounds( - cpi, &cpi->rc, cpi->oxcf.frm_dim_cfg.width, - cpi->oxcf.frm_dim_cfg.height, group_idx, &bottom_index, &top_index); - const int ac_q = av1_ac_quant_QTX(q, 0, cpi->common.seq_params.bit_depth); + cpi, cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height, + group_idx, &bottom_index, &top_index); + const int ac_q = av1_ac_quant_QTX(q, 0, cpi->common.seq_params->bit_depth); const float threshold = 0.7f * ac_q * ac_q; if (!is_second_arf) { diff --git a/third_party/libaom/source/libaom/av1/encoder/temporal_filter.h b/third_party/libaom/source/libaom/av1/encoder/temporal_filter.h index 2ae7dd4bda..3b9563755c 100644 --- a/third_party/libaom/source/libaom/av1/encoder/temporal_filter.h +++ b/third_party/libaom/source/libaom/av1/encoder/temporal_filter.h @@ -64,6 +64,14 @@ struct ThreadData; // then the actual threshold will be 720 * 0.1 = 72. Similarly, the threshold // for 360p videos will be 360 * 0.1 = 36. #define TF_SEARCH_DISTANCE_THRESHOLD 0.1 +// 6. Threshold to identify if the q is in a relative high range. +// Above this cutoff q, a stronger filtering is applied. +// For a high q, the quantization throws away more information, and thus a +// stronger filtering is less likely to distort the encoded quality, while a +// stronger filtering could reduce bit rates. +// Ror a low q, more details are expected to be retained. Filtering is thus +// more conservative. +#define TF_QINDEX_CUTOFF 128 #define NOISE_ESTIMATION_EDGE_THRESHOLD 50 @@ -276,11 +284,6 @@ static AOM_INLINE void tf_dealloc_data(TemporalFilterData *tf_data, aom_free(tf_data->pred); } -// Helper function to compute number of blocks on either side of the frame. 
-static INLINE int get_num_blocks(const int frame_length, const int mb_length) { - return (frame_length + mb_length - 1) / mb_length; -} - // Saves the state prior to temporal filter process. // Inputs: // mbd: Pointer to the block for filtering. diff --git a/third_party/libaom/source/libaom/av1/encoder/tokenize.c b/third_party/libaom/source/libaom/av1/encoder/tokenize.c index bc63cc00ae..7e16b29a9a 100644 --- a/third_party/libaom/source/libaom/av1/encoder/tokenize.c +++ b/third_party/libaom/source/libaom/av1/encoder/tokenize.c @@ -155,16 +155,18 @@ static void tokenize_vartx(ThreadData *td, TX_SIZE tx_size, const int bsw = tx_size_wide_unit[sub_txs]; const int bsh = tx_size_high_unit[sub_txs]; const int step = bsw * bsh; + const int row_end = + AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); + const int col_end = + AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); assert(bsw > 0 && bsh > 0); - for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { - for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { - const int offsetr = blk_row + row; + for (int row = 0; row < row_end; row += bsh) { + const int offsetr = blk_row + row; + for (int col = 0; col < col_end; col += bsw) { const int offsetc = blk_col + col; - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; - tokenize_vartx(td, sub_txs, plane_bsize, offsetr, offsetc, block, plane, arg); block += step; diff --git a/third_party/libaom/source/libaom/av1/encoder/tokenize.h b/third_party/libaom/source/libaom/av1/encoder/tokenize.h index 51eb28cee6..f31dc96958 100644 --- a/third_party/libaom/source/libaom/av1/encoder/tokenize.h +++ b/third_party/libaom/source/libaom/av1/encoder/tokenize.h @@ -119,8 +119,8 @@ static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols, // Allocate memory for token related info. 
static AOM_INLINE void alloc_token_info(AV1_COMMON *cm, TokenInfo *token_info) { int mi_rows_aligned_to_sb = - ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params.mib_size_log2); - int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2; + ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2); + int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params->mib_size_log2; const int num_planes = av1_num_planes(cm); unsigned int tokens = get_token_alloc(cm->mi_params.mb_rows, cm->mi_params.mb_cols, diff --git a/third_party/libaom/source/libaom/av1/encoder/tpl_model.c b/third_party/libaom/source/libaom/av1/encoder/tpl_model.c index 6ae957d4e5..e07ab3e311 100644 --- a/third_party/libaom/source/libaom/av1/encoder/tpl_model.c +++ b/third_party/libaom/source/libaom/av1/encoder/tpl_model.c @@ -35,38 +35,48 @@ #include "av1/encoder/reconinter_enc.h" #include "av1/encoder/tpl_model.h" -static AOM_INLINE int tpl_use_multithread(const AV1_COMP *cpi) { - return cpi->mt_info.num_workers > 1 && !cpi->sf.tpl_sf.allow_compound_pred; +static INLINE double exp_bounded(double v) { + // When v > 700 or <-700, the exp function will be close to overflow + // For details, see the "Notes" in the following link. + // https://en.cppreference.com/w/c/numeric/math/exp + if (v > 700) { + return DBL_MAX; + } else if (v < -700) { + return 0; + } + return exp(v); } -static AOM_INLINE void tpl_stats_record_txfm_block(TplDepFrame *tpl_frame, - const tran_low_t *coeff) { - aom_clear_system_state(); - // For transform larger than 16x16, the scale of coeff need to be adjusted. - // It's not LOSSLESS_Q_STEP. 
- assert(tpl_frame->coeff_num <= 256); - for (int i = 0; i < tpl_frame->coeff_num; ++i) { - tpl_frame->abs_coeff_sum[i] += abs(coeff[i]) / (double)LOSSLESS_Q_STEP; +void av1_init_tpl_txfm_stats(TplTxfmStats *tpl_txfm_stats) { + tpl_txfm_stats->coeff_num = 256; + tpl_txfm_stats->txfm_block_count = 0; + memset(tpl_txfm_stats->abs_coeff_sum, 0, + sizeof(tpl_txfm_stats->abs_coeff_sum[0]) * tpl_txfm_stats->coeff_num); +} + +void av1_accumulate_tpl_txfm_stats(const TplTxfmStats *sub_stats, + TplTxfmStats *accumulated_stats) { + accumulated_stats->txfm_block_count += sub_stats->txfm_block_count; + for (int i = 0; i < accumulated_stats->coeff_num; ++i) { + accumulated_stats->abs_coeff_sum[i] += sub_stats->abs_coeff_sum[i]; } - ++tpl_frame->txfm_block_count; } -static AOM_INLINE void tpl_stats_update_abs_coeff_mean(TplDepFrame *tpl_frame) { - aom_clear_system_state(); - for (int i = 0; i < tpl_frame->coeff_num; ++i) { - tpl_frame->abs_coeff_mean[i] = - tpl_frame->abs_coeff_sum[i] / tpl_frame->txfm_block_count; +void av1_record_tpl_txfm_block(TplTxfmStats *tpl_txfm_stats, + const tran_low_t *coeff) { + // For transform larger than 16x16, the scale of coeff need to be adjusted. + // It's not LOSSLESS_Q_STEP. 
+ assert(tpl_txfm_stats->coeff_num <= 256); + for (int i = 0; i < tpl_txfm_stats->coeff_num; ++i) { + tpl_txfm_stats->abs_coeff_sum[i] += abs(coeff[i]) / (double)LOSSLESS_Q_STEP; } + ++tpl_txfm_stats->txfm_block_count; } -void av1_tpl_stats_init_txfm_stats(TplDepFrame *tpl_frame, int tpl_bsize_1d) { - aom_clear_system_state(); - tpl_frame->txfm_block_count = 0; - tpl_frame->coeff_num = tpl_bsize_1d * tpl_bsize_1d; - memset(tpl_frame->abs_coeff_sum, 0, sizeof(tpl_frame->abs_coeff_sum)); - assert(sizeof(tpl_frame->abs_coeff_mean) / - sizeof(tpl_frame->abs_coeff_mean[0]) == - tpl_frame->coeff_num); +static AOM_INLINE void av1_tpl_store_txfm_stats( + TplParams *tpl_data, const TplTxfmStats *tpl_txfm_stats, + const int frame_index) { + tpl_data->txfm_stats_list[frame_index] = *tpl_txfm_stats; } static AOM_INLINE void get_quantize_error(const MACROBLOCK *x, int plane, @@ -118,9 +128,11 @@ static AOM_INLINE void set_tpl_stats_block_size(uint8_t *block_mis_log2, assert(*tpl_bsize_1d >= 16); } -void av1_setup_tpl_buffers(AV1_COMMON *const cm, TplParams *const tpl_data, - int lag_in_frames) { - CommonModeInfoParams *const mi_params = &cm->mi_params; +void av1_setup_tpl_buffers(AV1_PRIMARY *const ppi, + CommonModeInfoParams *const mi_params, int width, + int height, int byte_alignment, int lag_in_frames) { + SequenceHeader *const seq_params = &ppi->seq_params; + TplParams *const tpl_data = &ppi->tpl_data; set_tpl_stats_block_size(&tpl_data->tpl_stats_block_mis_log2, &tpl_data->tpl_bsize_1d); const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; @@ -139,7 +151,6 @@ void av1_setup_tpl_buffers(AV1_COMMON *const cm, TplParams *const tpl_data, tpl_frame->stride = tpl_data->tpl_stats_buffer[frame].width; tpl_frame->mi_rows = mi_params->mi_rows; tpl_frame->mi_cols = mi_params->mi_cols; - av1_tpl_stats_init_txfm_stats(tpl_frame, tpl_data->tpl_bsize_1d); } tpl_data->tpl_frame = &tpl_data->tpl_stats_buffer[REF_FRAMES + 1]; @@ -150,47 +161,33 @@ void 
av1_setup_tpl_buffers(AV1_COMMON *const cm, TplParams *const tpl_data, // TODO(aomedia:2873): Explore the allocation of tpl buffers based on // lag_in_frames. for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) { - CHECK_MEM_ERROR( - cm, tpl_data->tpl_stats_pool[frame], + AOM_CHECK_MEM_ERROR( + &ppi->error, tpl_data->tpl_stats_pool[frame], aom_calloc(tpl_data->tpl_stats_buffer[frame].width * tpl_data->tpl_stats_buffer[frame].height, sizeof(*tpl_data->tpl_stats_buffer[frame].tpl_stats_ptr))); - if (aom_alloc_frame_buffer( - &tpl_data->tpl_rec_pool[frame], cm->width, cm->height, - cm->seq_params.subsampling_x, cm->seq_params.subsampling_y, - cm->seq_params.use_highbitdepth, tpl_data->border_in_pixels, - cm->features.byte_alignment)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + + if (aom_alloc_frame_buffer(&tpl_data->tpl_rec_pool[frame], width, height, + seq_params->subsampling_x, + seq_params->subsampling_y, + seq_params->use_highbitdepth, + tpl_data->border_in_pixels, byte_alignment)) + aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } } -static AOM_INLINE void tpl_fwd_txfm(const int16_t *src_diff, int bw, - tran_low_t *coeff, TX_SIZE tx_size, - int bit_depth, int is_hbd) { - TxfmParam txfm_param; - txfm_param.tx_type = DCT_DCT; - txfm_param.tx_size = tx_size; - txfm_param.lossless = 0; - txfm_param.tx_set_type = EXT_TX_SET_ALL16; - - txfm_param.bd = bit_depth; - txfm_param.is_hbd = is_hbd; - av1_fwd_txfm(src_diff, coeff, bw, &txfm_param); -} - -static AOM_INLINE int64_t tpl_get_satd_cost(const MACROBLOCK *x, +static AOM_INLINE int64_t tpl_get_satd_cost(BitDepthInfo bd_info, int16_t *src_diff, int diff_stride, const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, tran_low_t *coeff, int bw, int bh, TX_SIZE tx_size) { - const MACROBLOCKD *xd = &x->e_mbd; const int pix_num = bw * bh; - av1_subtract_block(xd, bh, bw, src_diff, diff_stride, src, src_stride, dst, - dst_stride); - 
tpl_fwd_txfm(src_diff, bw, coeff, tx_size, xd->bd, is_cur_buf_hbd(xd)); + av1_subtract_block(bd_info, bh, bw, src_diff, diff_stride, src, src_stride, + dst, dst_stride); + av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, bw, coeff); return aom_satd(coeff, pix_num); } @@ -198,7 +195,6 @@ static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) { const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT]; assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob); - aom_clear_system_state(); int rate_cost = 1; for (int idx = 0; idx < eob; ++idx) { @@ -215,11 +211,11 @@ static AOM_INLINE void txfm_quant_rdcost( tran_low_t *qcoeff, tran_low_t *dqcoeff, int bw, int bh, TX_SIZE tx_size, int *rate_cost, int64_t *recon_error, int64_t *sse) { const MACROBLOCKD *xd = &x->e_mbd; + const BitDepthInfo bd_info = get_bit_depth_info(xd); uint16_t eob; - av1_subtract_block(xd, bh, bw, src_diff, diff_stride, src, src_stride, dst, - dst_stride); - tpl_fwd_txfm(src_diff, diff_stride, coeff, tx_size, xd->bd, - is_cur_buf_hbd(xd)); + av1_subtract_block(bd_info, bh, bw, src_diff, diff_stride, src, src_stride, + dst, dst_stride); + av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, bw, coeff); get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, &eob, recon_error, sse); @@ -316,13 +312,16 @@ static int is_alike_mv(int_mv candidate_mv, center_mv_t *center_mvs, } static void get_rate_distortion( - int *rate_cost, int64_t *recon_error, int16_t *src_diff, tran_low_t *coeff, - tran_low_t *qcoeff, tran_low_t *dqcoeff, AV1_COMMON *cm, MACROBLOCK *x, + int *rate_cost, int64_t *recon_error, int64_t *pred_error, + int16_t *src_diff, tran_low_t *coeff, tran_low_t *qcoeff, + tran_low_t *dqcoeff, AV1_COMMON *cm, MACROBLOCK *x, const YV12_BUFFER_CONFIG *ref_frame_ptr[2], uint8_t *rec_buffer_pool[3], const int rec_stride_pool[3], TX_SIZE tx_size, PREDICTION_MODE best_mode, int mi_row, int mi_col, int use_y_only_rate_distortion) { 
+ const SequenceHeader *seq_params = cm->seq_params; *rate_cost = 0; *recon_error = 1; + *pred_error = 1; MACROBLOCKD *xd = &x->e_mbd; int is_compound = (best_mode == NEW_NEWMV); @@ -356,7 +355,8 @@ static void get_rate_distortion( for (int ref = 0; ref < 1 + is_compound; ++ref) { if (!is_inter_mode(best_mode)) { av1_predict_intra_block( - cm, xd, block_size_wide[bsize_plane], block_size_high[bsize_plane], + xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, + block_size_wide[bsize_plane], block_size_high[bsize_plane], max_txsize_rect_lookup[bsize_plane], best_mode, 0, 0, FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, dst_buffer, dst_buffer_stride, 0, 0, plane); @@ -405,21 +405,24 @@ static void get_rate_distortion( &this_rate, &this_recon_error, &sse); *recon_error += this_recon_error; + *pred_error += sse; *rate_cost += this_rate; } } -static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, - int mi_col, BLOCK_SIZE bsize, - TX_SIZE tx_size, +static AOM_INLINE void mode_estimation(AV1_COMP *cpi, + TplTxfmStats *tpl_txfm_stats, + MACROBLOCK *x, int mi_row, int mi_col, + BLOCK_SIZE bsize, TX_SIZE tx_size, TplDepStats *tpl_stats) { AV1_COMMON *cm = &cpi->common; - const GF_GROUP *gf_group = &cpi->gf_group; + const GF_GROUP *gf_group = &cpi->ppi->gf_group; (void)gf_group; MACROBLOCKD *xd = &x->e_mbd; - TplParams *tpl_data = &cpi->tpl_data; + const BitDepthInfo bd_info = get_bit_depth_info(xd); + TplParams *tpl_data = &cpi->ppi->tpl_data; TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_data->frame_idx]; const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; @@ -471,6 +474,7 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, uint8_t *predictor = is_cur_buf_hbd(xd) ? 
CONVERT_TO_BYTEPTR(predictor8) : predictor8; int64_t recon_error = 1; + int64_t pred_error = 1; memset(tpl_stats, 0, sizeof(*tpl_stats)); tpl_stats->ref_frame_index[0] = -1; @@ -493,7 +497,6 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, // Pre-load the bottom left line. if (xd->left_available && mi_row + tx_size_high_unit[tx_size] < xd->tile.mi_row_end) { -#if CONFIG_AV1_HIGHBITDEPTH if (is_cur_buf_hbd(xd)) { uint16_t *dst = CONVERT_TO_SHORTPTR(dst_buffer); for (int i = 0; i < bw; ++i) @@ -504,26 +507,24 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, dst_buffer[(bw + i) * dst_buffer_stride - 1] = dst_buffer[(bw - 1) * dst_buffer_stride - 1]; } -#else - for (int i = 0; i < bw; ++i) - dst_buffer[(bw + i) * dst_buffer_stride - 1] = - dst_buffer[(bw - 1) * dst_buffer_stride - 1]; -#endif } // if cpi->sf.tpl_sf.prune_intra_modes is on, then search only DC_PRED, // H_PRED, and V_PRED const PREDICTION_MODE last_intra_mode = cpi->sf.tpl_sf.prune_intra_modes ? 
D45_PRED : INTRA_MODE_END; + const SequenceHeader *seq_params = cm->seq_params; for (PREDICTION_MODE mode = INTRA_MODE_START; mode < last_intra_mode; ++mode) { - av1_predict_intra_block(cm, xd, block_size_wide[bsize], - block_size_high[bsize], tx_size, mode, 0, 0, - FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, - predictor, bw, 0, 0, 0); + av1_predict_intra_block(xd, seq_params->sb_size, + seq_params->enable_intra_edge_filter, + block_size_wide[bsize], block_size_high[bsize], + tx_size, mode, 0, 0, FILTER_INTRA_MODES, dst_buffer, + dst_buffer_stride, predictor, bw, 0, 0, 0); - intra_cost = tpl_get_satd_cost(x, src_diff, bw, src_mb_buffer, src_stride, - predictor, bw, coeff, bw, bh, tx_size); + intra_cost = + tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride, + predictor, bw, coeff, bw, bh, tx_size); if (intra_cost < best_intra_cost) { best_intra_cost = intra_cost; @@ -607,7 +608,7 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, for (idx = 0; idx < refmv_count; ++idx) { FULLPEL_MV mv = get_fullmv_from_mv(¢er_mvs[idx].mv.as_mv); clamp_fullmv(&mv, &x->mv_limits); - center_mvs[idx].sad = (int)cpi->fn_ptr[bsize].sdf( + center_mvs[idx].sad = (int)cpi->ppi->fn_ptr[bsize].sdf( src_mb_buffer, src_stride, &ref_mb[mv.row * ref_stride + mv.col], ref_stride); } @@ -653,8 +654,9 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, av1_enc_build_one_inter_predictor(predictor, bw, &best_rfidx_mv.as_mv, &inter_pred_params); - inter_cost = tpl_get_satd_cost(x, src_diff, bw, src_mb_buffer, src_stride, - predictor, bw, coeff, bw, bh, tx_size); + inter_cost = + tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride, + predictor, bw, coeff, bw, bh, tx_size); // Store inter cost for each ref frame tpl_stats->pred_error[rf_idx] = AOMMAX(1, inter_cost); @@ -732,8 +734,9 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, 
av1_enc_build_one_inter_predictor(predictor, bw, &tmp_mv[ref].as_mv, &inter_pred_params); } - inter_cost = tpl_get_satd_cost(x, src_diff, bw, src_mb_buffer, src_stride, - predictor, bw, coeff, bw, bh, tx_size); + inter_cost = + tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride, + predictor, bw, coeff, bw, bh, tx_size); if (inter_cost < best_inter_cost) { best_cmp_rf_idx = cmp_rf_idx; best_inter_cost = inter_cost; @@ -760,8 +763,8 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, : NULL, }; int rate_cost = 1; - get_rate_distortion(&rate_cost, &recon_error, src_diff, coeff, qcoeff, - dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, + get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff, + qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, rec_stride_pool, tx_size, best_mode, mi_row, mi_col, use_y_only_rate_distortion); tpl_stats->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; @@ -772,7 +775,8 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, tpl_stats->inter_cost = best_inter_cost << TPL_DEP_COST_SCALE_LOG2; tpl_stats->intra_cost = best_intra_cost << TPL_DEP_COST_SCALE_LOG2; - tpl_stats->srcrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2); + tpl_stats->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; + tpl_stats->srcrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2; // Final encode int rate_cost = 0; @@ -786,21 +790,19 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, best_mode == NEW_NEWMV ? 
tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][1]] : NULL; - get_rate_distortion(&rate_cost, &recon_error, src_diff, coeff, qcoeff, - dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, + get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff, + qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, rec_stride_pool, tx_size, best_mode, mi_row, mi_col, use_y_only_rate_distortion); - if (!tpl_use_multithread(cpi)) { - // TODO(angiebird): make this work for multithread - tpl_stats_record_txfm_block(tpl_frame, coeff); - } + av1_record_tpl_txfm_block(tpl_txfm_stats, coeff); tpl_stats->recrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2); tpl_stats->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; if (!is_inter_mode(best_mode)) { tpl_stats->srcrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2); tpl_stats->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; + tpl_stats->srcrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2; } tpl_stats->recrf_dist = AOMMAX(tpl_stats->srcrf_dist, tpl_stats->recrf_dist); @@ -810,8 +812,8 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, ref_frame_ptr[0] = tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][0]]; ref_frame_ptr[1] = tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][1]]; - get_rate_distortion(&rate_cost, &recon_error, src_diff, coeff, qcoeff, - dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, + get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff, + qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, rec_stride_pool, tx_size, best_mode, mi_row, mi_col, use_y_only_rate_distortion); tpl_stats->cmp_recrf_dist[0] = recon_error << TPL_DEP_COST_SCALE_LOG2; @@ -831,8 +833,8 @@ static AOM_INLINE void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, ref_frame_ptr[0] = tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][0]]; ref_frame_ptr[1] = tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][1]]; - get_rate_distortion(&rate_cost, 
&recon_error, src_diff, coeff, qcoeff, - dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, + get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff, + qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool, rec_stride_pool, tx_size, best_mode, mi_row, mi_col, use_y_only_rate_distortion); tpl_stats->cmp_recrf_dist[1] = recon_error << TPL_DEP_COST_SCALE_LOG2; @@ -887,41 +889,24 @@ static int round_floor(int ref_pos, int bsize_pix) { return round; } -static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row, - int ref_pos_col, int block, BLOCK_SIZE bsize) { - int width = 0, height = 0; - int bw = 4 << mi_size_wide_log2[bsize]; - int bh = 4 << mi_size_high_log2[bsize]; - - switch (block) { - case 0: - width = grid_pos_col + bw - ref_pos_col; - height = grid_pos_row + bh - ref_pos_row; - break; - case 1: - width = ref_pos_col + bw - grid_pos_col; - height = grid_pos_row + bh - ref_pos_row; - break; - case 2: - width = grid_pos_col + bw - ref_pos_col; - height = ref_pos_row + bh - grid_pos_row; - break; - case 3: - width = ref_pos_col + bw - grid_pos_col; - height = ref_pos_row + bh - grid_pos_row; - break; - default: assert(0); +int av1_get_overlap_area(int row_a, int col_a, int row_b, int col_b, int width, + int height) { + int min_row = AOMMAX(row_a, row_b); + int max_row = AOMMIN(row_a + height, row_b + height); + int min_col = AOMMAX(col_a, col_b); + int max_col = AOMMIN(col_a + width, col_b + width); + if (min_row < max_row && min_col < max_col) { + return (max_row - min_row) * (max_col - min_col); } - int overlap_area = width * height; - return overlap_area; + return 0; } int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift) { return (mi_row >> right_shift) * stride + (mi_col >> right_shift); } -static int64_t delta_rate_cost(int64_t delta_rate, int64_t recrf_dist, - int64_t srcrf_dist, int pix_num) { +int64_t av1_delta_rate_cost(int64_t delta_rate, int64_t recrf_dist, + int64_t srcrf_dist, int 
pix_num) { double beta = (double)srcrf_dist / recrf_dist; int64_t rate_cost = delta_rate; @@ -952,7 +937,6 @@ static int64_t delta_rate_cost(int64_t delta_rate, int64_t recrf_dist, static AOM_INLINE void tpl_model_update_b(TplParams *const tpl_data, int mi_row, int mi_col, const BLOCK_SIZE bsize, int frame_idx, int ref) { - aom_clear_system_state(); TplDepFrame *tpl_frame_ptr = &tpl_data->tpl_frame[frame_idx]; TplDepStats *tpl_ptr = tpl_frame_ptr->tpl_stats_ptr; TplDepFrame *tpl_frame = tpl_data->tpl_frame; @@ -998,8 +982,8 @@ static AOM_INLINE void tpl_model_update_b(TplParams *const tpl_data, int mi_row, tpl_stats_ptr->recrf_dist)); int64_t delta_rate = tpl_stats_ptr->recrf_rate - srcrf_rate; int64_t mc_dep_rate = - delta_rate_cost(tpl_stats_ptr->mc_dep_rate, tpl_stats_ptr->recrf_dist, - srcrf_dist, pix_num); + av1_delta_rate_cost(tpl_stats_ptr->mc_dep_rate, tpl_stats_ptr->recrf_dist, + srcrf_dist, pix_num); for (block = 0; block < 4; ++block) { int grid_pos_row = grid_pos_row_base + bh * (block >> 1); @@ -1007,8 +991,8 @@ static AOM_INLINE void tpl_model_update_b(TplParams *const tpl_data, int mi_row, if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE && grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) { - int overlap_area = get_overlap_area( - grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize); + int overlap_area = av1_get_overlap_area(grid_pos_row, grid_pos_col, + ref_pos_row, ref_pos_col, bw, bh); int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height; int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width; assert((1 << block_mis_log2) == mi_height); @@ -1043,6 +1027,7 @@ static AOM_INLINE void tpl_model_store(TplDepStats *tpl_stats_ptr, int mi_row, tpl_ptr->intra_cost = AOMMAX(1, tpl_ptr->intra_cost); tpl_ptr->inter_cost = AOMMAX(1, tpl_ptr->inter_cost); tpl_ptr->srcrf_dist = AOMMAX(1, tpl_ptr->srcrf_dist); + tpl_ptr->srcrf_sse = AOMMAX(1, tpl_ptr->srcrf_sse); tpl_ptr->recrf_dist = AOMMAX(1, 
tpl_ptr->recrf_dist); tpl_ptr->srcrf_rate = AOMMAX(1, tpl_ptr->srcrf_rate); tpl_ptr->recrf_rate = AOMMAX(1, tpl_ptr->recrf_rate); @@ -1068,12 +1053,12 @@ static AOM_INLINE int get_gop_length(const GF_GROUP *gf_group) { // Initialize the mc_flow parameters used in computing tpl data. static AOM_INLINE void init_mc_flow_dispenser(AV1_COMP *cpi, int frame_idx, int pframe_qindex) { - TplParams *const tpl_data = &cpi->tpl_data; + TplParams *const tpl_data = &cpi->ppi->tpl_data; TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx]; const YV12_BUFFER_CONFIG *this_frame = tpl_frame->gf_picture; const YV12_BUFFER_CONFIG *ref_frames_ordered[INTER_REFS_PER_FRAME]; uint32_t ref_frame_display_indices[INTER_REFS_PER_FRAME]; - GF_GROUP *gf_group = &cpi->gf_group; + GF_GROUP *gf_group = &cpi->ppi->gf_group; int ref_pruning_enabled = is_frame_eligible_for_ref_pruning( gf_group, cpi->sf.inter_sf.selective_ref_frame, cpi->sf.tpl_sf.prune_ref_frames_in_tpl, frame_idx); @@ -1084,6 +1069,7 @@ static AOM_INLINE void init_mc_flow_dispenser(AV1_COMP *cpi, int frame_idx, ThreadData *td = &cpi->td; MACROBLOCK *x = &td->mb; MACROBLOCKD *xd = &x->e_mbd; + TplTxfmStats *tpl_txfm_stats = &td->tpl_txfm_stats; tpl_data->frame_idx = frame_idx; tpl_reset_src_ref_frames(tpl_data); av1_tile_init(&xd->tile, cm, 0, 0); @@ -1161,18 +1147,21 @@ static AOM_INLINE void init_mc_flow_dispenser(AV1_COMP *cpi, int frame_idx, tpl_frame->base_rdmult = av1_compute_rd_mult_based_on_qindex(cpi, pframe_qindex) / 6; + + av1_init_tpl_txfm_stats(tpl_txfm_stats); } // This function stores the motion estimation dependencies of all the blocks in // a row -void av1_mc_flow_dispenser_row(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, - BLOCK_SIZE bsize, TX_SIZE tx_size) { +void av1_mc_flow_dispenser_row(AV1_COMP *cpi, TplTxfmStats *tpl_txfm_stats, + MACROBLOCK *x, int mi_row, BLOCK_SIZE bsize, + TX_SIZE tx_size) { AV1_COMMON *const cm = &cpi->common; MultiThreadInfo *const mt_info = &cpi->mt_info; AV1TplRowMultiThreadInfo 
*const tpl_row_mt = &mt_info->tpl_row_mt; const CommonModeInfoParams *const mi_params = &cm->mi_params; const int mi_width = mi_size_wide[bsize]; - TplParams *const tpl_data = &cpi->tpl_data; + TplParams *const tpl_data = &cpi->ppi->tpl_data; TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_data->frame_idx]; MACROBLOCKD *xd = &x->e_mbd; @@ -1194,7 +1183,8 @@ void av1_mc_flow_dispenser_row(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE); xd->mb_to_right_edge = GET_MV_SUBPEL(mi_params->mi_cols - mi_width - mi_col); - mode_estimation(cpi, x, mi_row, mi_col, bsize, tx_size, &tpl_stats); + mode_estimation(cpi, tpl_txfm_stats, x, mi_row, mi_col, bsize, tx_size, + &tpl_stats); // Motion flow dependency dispenser. tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, tpl_frame->stride, @@ -1210,40 +1200,36 @@ static AOM_INLINE void mc_flow_dispenser(AV1_COMP *cpi) { ThreadData *td = &cpi->td; MACROBLOCK *x = &td->mb; MACROBLOCKD *xd = &x->e_mbd; - const BLOCK_SIZE bsize = convert_length_to_bsize(cpi->tpl_data.tpl_bsize_1d); + const BLOCK_SIZE bsize = + convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d); const TX_SIZE tx_size = max_txsize_lookup[bsize]; const int mi_height = mi_size_high[bsize]; for (int mi_row = 0; mi_row < mi_params->mi_rows; mi_row += mi_height) { // Motion estimation row boundary av1_set_mv_row_limits(mi_params, &x->mv_limits, mi_row, mi_height, - cpi->tpl_data.border_in_pixels); + cpi->ppi->tpl_data.border_in_pixels); xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); xd->mb_to_bottom_edge = GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE); - av1_mc_flow_dispenser_row(cpi, x, mi_row, bsize, tx_size); - } - if (!tpl_use_multithread(cpi)) { - // TODO(angiebird): make this work for multithread - TplDepFrame *tpl_frame = &cpi->tpl_data.tpl_frame[cpi->tpl_data.frame_idx]; - tpl_stats_update_abs_coeff_mean(tpl_frame); + av1_mc_flow_dispenser_row(cpi, &td->tpl_txfm_stats, x, 
mi_row, bsize, + tx_size); } } -static void mc_flow_synthesizer(AV1_COMP *cpi, int frame_idx) { - AV1_COMMON *cm = &cpi->common; - TplParams *const tpl_data = &cpi->tpl_data; - +static void mc_flow_synthesizer(TplParams *tpl_data, int frame_idx, int mi_rows, + int mi_cols) { + if (!frame_idx) { + return; + } const BLOCK_SIZE bsize = convert_length_to_bsize(tpl_data->tpl_bsize_1d); const int mi_height = mi_size_high[bsize]; const int mi_width = mi_size_wide[bsize]; assert(mi_height == (1 << tpl_data->tpl_stats_block_mis_log2)); assert(mi_width == (1 << tpl_data->tpl_stats_block_mis_log2)); - for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += mi_height) { - for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += mi_width) { - if (frame_idx) { - tpl_model_update(tpl_data, mi_row, mi_col, frame_idx); - } + for (int mi_row = 0; mi_row < mi_rows; mi_row += mi_height) { + for (int mi_col = 0; mi_col < mi_cols; mi_col += mi_width) { + tpl_model_update(tpl_data, mi_row, mi_col, frame_idx); } } } @@ -1253,12 +1239,17 @@ static AOM_INLINE void init_gop_frames_for_tpl( GF_GROUP *gf_group, int gop_eval, int *tpl_group_frames, const EncodeFrameInput *const frame_input, int *pframe_qindex) { AV1_COMMON *cm = &cpi->common; - int cur_frame_idx = gf_group->index; + int cur_frame_idx = cpi->gf_frame_index; *pframe_qindex = 0; +#if CONFIG_FRAME_PARALLEL_ENCODE + RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]; + init_ref_map_pair(cpi, ref_frame_map_pairs); +#endif // CONFIG_FRAME_PARALLEL_ENCODE + RefBufferStack ref_buffer_stack = cpi->ref_buffer_stack; EncodeFrameParams frame_params = *init_frame_params; - TplParams *const tpl_data = &cpi->tpl_data; + TplParams *const tpl_data = &cpi->ppi->tpl_data; int ref_picture_map[REF_FRAMES]; @@ -1288,7 +1279,7 @@ static AOM_INLINE void init_gop_frames_for_tpl( TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index]; FRAME_UPDATE_TYPE frame_update_type = gf_group->update_type[gf_index]; int frame_display_index = gf_index == 
gf_group->size - ? cpi->rc.baseline_gf_interval + ? cpi->ppi->p_rc.baseline_gf_interval : gf_group->cur_frame_idx[gf_index] + gf_group->arf_src_offset[gf_index]; @@ -1317,7 +1308,7 @@ static AOM_INLINE void init_gop_frames_for_tpl( } if (gop_eval && cpi->rc.frames_since_key > 0 && gf_group->arf_index == gf_index) - tpl_frame->gf_picture = &cpi->alt_ref_buffer; + tpl_frame->gf_picture = &cpi->ppi->alt_ref_buffer; // 'cm->current_frame.frame_number' is the display number // of the current frame. @@ -1338,15 +1329,45 @@ static AOM_INLINE void init_gop_frames_for_tpl( tpl_frame->tpl_stats_ptr = tpl_data->tpl_stats_pool[process_frame_count]; ++process_frame_count; } - - av1_get_ref_frames(cpi, &ref_buffer_stack); - int refresh_mask = av1_get_refresh_frame_flags( - cpi, &frame_params, frame_update_type, &ref_buffer_stack); +#if CONFIG_FRAME_PARALLEL_ENCODE + const int true_disp = (int)(tpl_frame->frame_display_index); +#endif // CONFIG_FRAME_PARALLEL_ENCODE + + av1_get_ref_frames(&ref_buffer_stack, +#if CONFIG_FRAME_PARALLEL_ENCODE + cpi, ref_frame_map_pairs, true_disp, +#endif // CONFIG_FRAME_PARALLEL_ENCODE + cm->remapped_ref_idx); + + int refresh_mask = + av1_get_refresh_frame_flags(cpi, &frame_params, frame_update_type, +#if CONFIG_FRAME_PARALLEL_ENCODE + true_disp, ref_frame_map_pairs, +#endif // CONFIG_FRAME_PARALLEL_ENCODE + &ref_buffer_stack); + +#if CONFIG_FRAME_PARALLEL_ENCODE + // Make the frames marked as is_frame_non_ref to non-reference frames. 
+ if (cpi->ppi->gf_group.is_frame_non_ref[gf_index]) refresh_mask = 0; +#endif // CONFIG_FRAME_PARALLEL_ENCODE int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask); +#if !CONFIG_FRAME_PARALLEL_ENCODE av1_update_ref_frame_map(cpi, frame_update_type, frame_params.frame_type, frame_params.show_existing_frame, refresh_frame_map_index, &ref_buffer_stack); +#endif // CONFIG_FRAME_PARALLEL_ENCODE + +#if CONFIG_FRAME_PARALLEL_ENCODE + if (refresh_frame_map_index < REF_FRAMES && + refresh_frame_map_index != INVALID_IDX) { + ref_frame_map_pairs[refresh_frame_map_index].disp_order = + AOMMAX(0, true_disp); + ref_frame_map_pairs[refresh_frame_map_index].pyr_level = + get_true_pyr_level(gf_group->layer_depth[gf_index], true_disp, + cpi->ppi->gf_group.max_layer_depth); + } +#endif // CONFIG_FRAME_PARALLEL_ENCODE for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) tpl_frame->ref_map_index[i - LAST_FRAME] = @@ -1360,8 +1381,9 @@ static AOM_INLINE void init_gop_frames_for_tpl( if (cpi->rc.frames_since_key == 0) return; int extend_frame_count = 0; - int extend_frame_length = AOMMIN( - MAX_TPL_EXTEND, cpi->rc.frames_to_key - cpi->rc.baseline_gf_interval); + int extend_frame_length = + AOMMIN(MAX_TPL_EXTEND, + cpi->rc.frames_to_key - cpi->ppi->p_rc.baseline_gf_interval); int frame_display_index = gf_group->cur_frame_idx[gop_length - 1] + gf_group->arf_src_offset[gop_length - 1] + 1; @@ -1400,14 +1422,37 @@ static AOM_INLINE void init_gop_frames_for_tpl( gf_group->update_type[gf_index] = LF_UPDATE; gf_group->q_val[gf_index] = *pframe_qindex; - - av1_get_ref_frames(cpi, &ref_buffer_stack); - int refresh_mask = av1_get_refresh_frame_flags( - cpi, &frame_params, frame_update_type, &ref_buffer_stack); +#if CONFIG_FRAME_PARALLEL_ENCODE + const int true_disp = (int)(tpl_frame->frame_display_index); +#endif // CONFIG_FRAME_PARALLEL_ENCODE + av1_get_ref_frames(&ref_buffer_stack, +#if CONFIG_FRAME_PARALLEL_ENCODE + cpi, ref_frame_map_pairs, true_disp, +#endif // 
CONFIG_FRAME_PARALLEL_ENCODE + cm->remapped_ref_idx); + int refresh_mask = + av1_get_refresh_frame_flags(cpi, &frame_params, frame_update_type, +#if CONFIG_FRAME_PARALLEL_ENCODE + true_disp, ref_frame_map_pairs, +#endif // CONFIG_FRAME_PARALLEL_ENCODE + &ref_buffer_stack); int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask); +#if !CONFIG_FRAME_PARALLEL_ENCODE av1_update_ref_frame_map(cpi, frame_update_type, frame_params.frame_type, frame_params.show_existing_frame, refresh_frame_map_index, &ref_buffer_stack); +#endif // CONFIG_FRAME_PARALLEL_ENCODE + +#if CONFIG_FRAME_PARALLEL_ENCODE + if (refresh_frame_map_index < REF_FRAMES && + refresh_frame_map_index != INVALID_IDX) { + ref_frame_map_pairs[refresh_frame_map_index].disp_order = + AOMMAX(0, true_disp); + ref_frame_map_pairs[refresh_frame_map_index].pyr_level = + get_true_pyr_level(gf_group->layer_depth[gf_index], true_disp, + cpi->ppi->gf_group.max_layer_depth); + } +#endif // CONFIG_FRAME_PARALLEL_ENCODE for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) tpl_frame->ref_map_index[i - LAST_FRAME] = @@ -1424,8 +1469,16 @@ static AOM_INLINE void init_gop_frames_for_tpl( ++extend_frame_count; ++frame_display_index; } - - av1_get_ref_frames(cpi, &cpi->ref_buffer_stack); +#if CONFIG_FRAME_PARALLEL_ENCODE + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[cur_frame_idx]; + const int true_disp = (int)(tpl_frame->frame_display_index); + init_ref_map_pair(cpi, ref_frame_map_pairs); +#endif // CONFIG_FRAME_PARALLEL_ENCODE + av1_get_ref_frames(&cpi->ref_buffer_stack, +#if CONFIG_FRAME_PARALLEL_ENCODE + cpi, ref_frame_map_pairs, true_disp, +#endif // CONFIG_FRAME_PARALLEL_ENCODE + cm->remapped_ref_idx); } void av1_init_tpl_stats(TplParams *const tpl_data) { @@ -1440,9 +1493,47 @@ void av1_init_tpl_stats(TplParams *const tpl_data) { sizeof(*tpl_frame->tpl_stats_ptr)); tpl_frame->is_valid = 0; } - for (frame_idx = 0; frame_idx < MAX_LENGTH_TPL_FRAME_STATS; ++frame_idx) { - TplDepFrame *tpl_frame = 
&tpl_data->tpl_stats_buffer[frame_idx]; - av1_tpl_stats_init_txfm_stats(tpl_frame, tpl_data->tpl_bsize_1d); +#if CONFIG_BITRATE_ACCURACY + tpl_data->estimated_gop_bitrate = 0; + tpl_data->actual_gop_bitrate = 0; +#endif +} + +static AOM_INLINE int eval_gop_length(double *beta, int gop_eval) { + switch (gop_eval) { + case 1: + // Allow larger GOP size if the base layer ARF has higher dependency + // factor than the intermediate ARF and both ARFs have reasonably high + // dependency factors. + return (beta[0] >= beta[1] + 0.7) && beta[0] > 8.0; + case 2: + if ((beta[0] >= beta[1] + 0.4) && beta[0] > 1.6) + return 1; // Don't shorten the gf interval + else if ((beta[0] < beta[1] + 0.1) || beta[0] <= 1.4) + return 0; // Shorten the gf interval + else + return 2; // Cannot decide the gf interval, so redo the + // tpl stats calculation. + case 3: return beta[0] > 1.1; + default: return 2; + } +} + +// TODO(jingning): Restructure av1_rc_pick_q_and_bounds() to narrow down +// the scope of input arguments. 
+void av1_tpl_preload_rc_estimate(AV1_COMP *cpi, + const EncodeFrameParams *const frame_params) { + AV1_COMMON *cm = &cpi->common; + GF_GROUP *gf_group = &cpi->ppi->gf_group; + int bottom_index, top_index; + cm->current_frame.frame_type = frame_params->frame_type; + for (int gf_index = cpi->gf_frame_index; gf_index < gf_group->size; + ++gf_index) { + cm->current_frame.frame_type = gf_group->frame_type[gf_index]; + cm->show_frame = gf_group->update_type[gf_index] != ARF_UPDATE && + gf_group->update_type[gf_index] != INTNL_ARF_UPDATE; + gf_group->q_val[gf_index] = av1_rc_pick_q_and_bounds( + cpi, cm->width, cm->height, gf_index, &bottom_index, &top_index); } } @@ -1455,10 +1546,17 @@ int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval, AV1_COMMON *cm = &cpi->common; MultiThreadInfo *const mt_info = &cpi->mt_info; AV1TplRowMultiThreadInfo *const tpl_row_mt = &mt_info->tpl_row_mt; - GF_GROUP *gf_group = &cpi->gf_group; - int bottom_index, top_index; + GF_GROUP *gf_group = &cpi->ppi->gf_group; EncodeFrameParams this_frame_params = *frame_params; - TplParams *const tpl_data = &cpi->tpl_data; + TplParams *const tpl_data = &cpi->ppi->tpl_data; + int approx_gop_eval = (gop_eval > 1); + int num_arf_layers = MAX_ARF_LAYERS; + + // When gop_eval is set to 2, tpl stats calculation is done for ARFs from base + // layer, (base+1) layer and (base+2) layer. When gop_eval is set to 3, + // tpl stats calculation is limited to ARFs from base layer and (base+1) + // layer. + if (approx_gop_eval) num_arf_layers = (gop_eval == 2) ? 
3 : 2; if (cpi->superres_mode != AOM_SUPERRES_NONE) { assert(cpi->superres_mode != AOM_SUPERRES_AUTO); @@ -1467,7 +1565,8 @@ int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval, } cm->current_frame.frame_type = frame_params->frame_type; - for (int gf_index = gf_group->index; gf_index < gf_group->size; ++gf_index) { + for (int gf_index = cpi->gf_frame_index; gf_index < gf_group->size; + ++gf_index) { cm->current_frame.frame_type = gf_group->frame_type[gf_index]; av1_configure_buffer_updates(cpi, &this_frame_params.refresh_frame, gf_group->update_type[gf_index], @@ -1475,13 +1574,6 @@ int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval, memcpy(&cpi->refresh_frame, &this_frame_params.refresh_frame, sizeof(cpi->refresh_frame)); - - cm->show_frame = gf_group->update_type[gf_index] != ARF_UPDATE && - gf_group->update_type[gf_index] != INTNL_ARF_UPDATE; - - gf_group->q_val[gf_index] = - av1_rc_pick_q_and_bounds(cpi, &cpi->rc, cm->width, cm->height, gf_index, - &bottom_index, &top_index); } int pframe_qindex; @@ -1489,7 +1581,7 @@ int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval, init_gop_frames_for_tpl(cpi, frame_params, gf_group, gop_eval, &tpl_gf_group_frames, frame_input, &pframe_qindex); - cpi->rc.base_layer_qp = pframe_qindex; + cpi->ppi->p_rc.base_layer_qp = pframe_qindex; av1_init_tpl_stats(tpl_data); @@ -1505,37 +1597,59 @@ int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval, av1_fill_mv_costs(&cm->fc->nmvc, cm->features.cur_frame_force_integer_mv, cm->features.allow_high_precision_mv, cpi->td.mb.mv_costs); + const int gop_length = get_gop_length(gf_group); // Backward propagation from tpl_group_frames to 1. 
- for (int frame_idx = gf_group->index; frame_idx < tpl_gf_group_frames; + for (int frame_idx = cpi->gf_frame_index; frame_idx < tpl_gf_group_frames; ++frame_idx) { if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE || gf_group->update_type[frame_idx] == OVERLAY_UPDATE) continue; + // When approx_gop_eval = 1, skip tpl stats calculation for higher layer + // frames and for frames beyond gop length. + if (approx_gop_eval && (gf_group->layer_depth[frame_idx] > num_arf_layers || + frame_idx >= gop_length)) + continue; + init_mc_flow_dispenser(cpi, frame_idx, pframe_qindex); - if (tpl_use_multithread(cpi)) { + if (mt_info->num_workers > 1) { tpl_row_mt->sync_read_ptr = av1_tpl_row_mt_sync_read; tpl_row_mt->sync_write_ptr = av1_tpl_row_mt_sync_write; av1_mc_flow_dispenser_mt(cpi); } else { mc_flow_dispenser(cpi); } + av1_tpl_store_txfm_stats(tpl_data, &cpi->td.tpl_txfm_stats, frame_idx); aom_extend_frame_borders(tpl_data->tpl_frame[frame_idx].rec_picture, av1_num_planes(cm)); } - for (int frame_idx = tpl_gf_group_frames - 1; frame_idx >= gf_group->index; - --frame_idx) { +#if CONFIG_BITRATE_ACCURACY + tpl_data->estimated_gop_bitrate = av1_estimate_gop_bitrate( + gf_group->q_val, gf_group->size, tpl_data->txfm_stats_list); + if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE && + gop_eval == 0) { + printf("\nestimated bitrate: %f\n", tpl_data->estimated_gop_bitrate); + } +#endif + + for (int frame_idx = tpl_gf_group_frames - 1; + frame_idx >= cpi->gf_frame_index; --frame_idx) { if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE || gf_group->update_type[frame_idx] == OVERLAY_UPDATE) continue; - mc_flow_synthesizer(cpi, frame_idx); + if (approx_gop_eval && (gf_group->layer_depth[frame_idx] > num_arf_layers || + frame_idx >= gop_length)) + continue; + + mc_flow_synthesizer(tpl_data, frame_idx, cm->mi_params.mi_rows, + cm->mi_params.mi_cols); } av1_configure_buffer_updates(cpi, &this_frame_params.refresh_frame, - 
gf_group->update_type[gf_group->index], + gf_group->update_type[cpi->gf_frame_index], frame_params->frame_type, 0); cm->current_frame.frame_type = frame_params->frame_type; cm->show_frame = frame_params->show_frame; @@ -1592,21 +1706,17 @@ int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval, #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, av1_tpl_setup_stats_time); #endif - - // Allow larger GOP size if the base layer ARF has higher dependency factor - // than the intermediate ARF and both ARFs have reasonably high dependency - // factors. - return (beta[0] >= beta[1] + 0.7) && beta[0] > 8.0; + return eval_gop_length(beta, gop_eval); } void av1_tpl_rdmult_setup(AV1_COMP *cpi) { const AV1_COMMON *const cm = &cpi->common; - const GF_GROUP *const gf_group = &cpi->gf_group; - const int tpl_idx = gf_group->index; + const int tpl_idx = cpi->gf_frame_index; - assert(IMPLIES(gf_group->size > 0, tpl_idx < gf_group->size)); + assert( + IMPLIES(cpi->ppi->gf_group.size > 0, tpl_idx < cpi->ppi->gf_group.size)); - TplParams *const tpl_data = &cpi->tpl_data; + TplParams *const tpl_data = &cpi->ppi->tpl_data; const TplDepFrame *const tpl_frame = &tpl_data->tpl_frame[tpl_idx]; if (!tpl_frame->is_valid) return; @@ -1623,8 +1733,6 @@ void av1_tpl_rdmult_setup(AV1_COMP *cpi) { const double c = 1.2; const int step = 1 << tpl_data->tpl_stats_block_mis_log2; - aom_clear_system_state(); - // Loop through each 'block_size' X 'block_size' block. 
for (int row = 0; row < num_rows; row++) { for (int col = 0; col < num_cols; col++) { @@ -1647,24 +1755,23 @@ void av1_tpl_rdmult_setup(AV1_COMP *cpi) { } const double rk = intra_cost / mc_dep_cost; const int index = row * num_cols + col; - cpi->tpl_rdmult_scaling_factors[index] = rk / cpi->rd.r0 + c; + cpi->ppi->tpl_rdmult_scaling_factors[index] = rk / cpi->rd.r0 + c; } } - aom_clear_system_state(); } void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const x, BLOCK_SIZE sb_size, int mi_row, int mi_col) { AV1_COMMON *const cm = &cpi->common; - GF_GROUP *gf_group = &cpi->gf_group; - assert(IMPLIES(cpi->gf_group.size > 0, - cpi->gf_group.index < cpi->gf_group.size)); - const int tpl_idx = cpi->gf_group.index; - TplDepFrame *tpl_frame = &cpi->tpl_data.tpl_frame[tpl_idx]; - - if (tpl_frame->is_valid == 0) return; - if (!is_frame_tpl_eligible(gf_group, gf_group->index)) return; + GF_GROUP *gf_group = &cpi->ppi->gf_group; + assert(IMPLIES(cpi->ppi->gf_group.size > 0, + cpi->gf_frame_index < cpi->ppi->gf_group.size)); + const int tpl_idx = cpi->gf_frame_index; + if (tpl_idx >= MAX_TPL_FRAME_IDX) return; + TplDepFrame *tpl_frame = &cpi->ppi->tpl_data.tpl_frame[tpl_idx]; + if (!tpl_frame->is_valid) return; + if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return; if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return; const int mi_col_sr = @@ -1685,13 +1792,12 @@ void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const x, double base_block_count = 0.0; double log_sum = 0.0; - aom_clear_system_state(); for (row = mi_row / num_mi_w; row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { for (col = mi_col_sr / num_mi_h; col < num_cols && col < mi_col_sr / num_mi_h + num_bcols; ++col) { const int index = row * num_cols + col; - log_sum += log(cpi->tpl_rdmult_scaling_factors[index]); + log_sum += log(cpi->ppi->tpl_rdmult_scaling_factors[index]); base_block_count += 1.0; } } @@ -1705,33 +1811,30 @@ void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const 
x, const double scaling_factor = (double)new_rdmult / (double)orig_rdmult; double scale_adj = log(scaling_factor) - log_sum / base_block_count; - scale_adj = exp(scale_adj); + scale_adj = exp_bounded(scale_adj); for (row = mi_row / num_mi_w; row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { for (col = mi_col_sr / num_mi_h; col < num_cols && col < mi_col_sr / num_mi_h + num_bcols; ++col) { const int index = row * num_cols + col; - cpi->tpl_sb_rdmult_scaling_factors[index] = - scale_adj * cpi->tpl_rdmult_scaling_factors[index]; + cpi->ppi->tpl_sb_rdmult_scaling_factors[index] = + scale_adj * cpi->ppi->tpl_rdmult_scaling_factors[index]; } } - aom_clear_system_state(); } -#define EPSILON (0.0000001) - double av1_exponential_entropy(double q_step, double b) { - aom_clear_system_state(); - double z = fmax(exp(-q_step / b), EPSILON); + b = AOMMAX(b, TPL_EPSILON); + double z = fmax(exp_bounded(-q_step / b), TPL_EPSILON); return -log2(1 - z) - z * log2(z) / (1 - z); } double av1_laplace_entropy(double q_step, double b, double zero_bin_ratio) { - aom_clear_system_state(); // zero bin's size is zero_bin_ratio * q_step // non-zero bin's size is q_step - double z = fmax(exp(-zero_bin_ratio / 2 * q_step / b), EPSILON); + b = AOMMAX(b, TPL_EPSILON); + double z = fmax(exp_bounded(-zero_bin_ratio / 2 * q_step / b), TPL_EPSILON); double h = av1_exponential_entropy(q_step, b); double r = -(1 - z) * log2(1 - z) - z * log2(z) + z * (h + 1); return r; @@ -1740,7 +1843,6 @@ double av1_laplace_entropy(double q_step, double b, double zero_bin_ratio) { double av1_laplace_estimate_frame_rate(int q_index, int block_count, const double *abs_coeff_mean, int coeff_num) { - aom_clear_system_state(); double zero_bin_ratio = 2; double dc_q_step = av1_dc_quant_QTX(q_index, 0, AOM_BITS_8) / 4.; double ac_q_step = av1_ac_quant_QTX(q_index, 0, AOM_BITS_8) / 4.; @@ -1755,3 +1857,58 @@ double av1_laplace_estimate_frame_rate(int q_index, int block_count, est_rate *= block_count; return 
est_rate; } + +double av1_estimate_gop_bitrate(const unsigned char *q_index_list, + const int frame_count, + const TplTxfmStats *stats_list) { + double gop_bitrate = 0; + for (int frame_index = 0; frame_index < frame_count; frame_index++) { + int q_index = q_index_list[frame_index]; + TplTxfmStats frame_stats = stats_list[frame_index]; + + /* Convert to mean absolute deviation */ + double abs_coeff_mean[256] = { 0 }; + for (int i = 0; i < 256; i++) { + abs_coeff_mean[i] = + frame_stats.abs_coeff_sum[i] / frame_stats.txfm_block_count; + } + + double frame_bitrate = av1_laplace_estimate_frame_rate( + q_index, frame_stats.txfm_block_count, abs_coeff_mean, 256); + gop_bitrate += frame_bitrate; + } + return gop_bitrate; +} + +double av1_estimate_coeff_entropy(double q_step, double b, + double zero_bin_ratio, int qcoeff) { + b = AOMMAX(b, TPL_EPSILON); + int abs_qcoeff = abs(qcoeff); + double z0 = fmax(exp_bounded(-zero_bin_ratio / 2 * q_step / b), TPL_EPSILON); + if (abs_qcoeff == 0) { + double r = -log2(1 - z0); + return r; + } else { + double z = fmax(exp_bounded(-q_step / b), TPL_EPSILON); + double r = 1 - log2(z0) - log2(1 - z) - (abs_qcoeff - 1) * log2(z); + return r; + } +} + +double av1_estimate_txfm_block_entropy(int q_index, + const double *abs_coeff_mean, + int *qcoeff_arr, int coeff_num) { + double zero_bin_ratio = 2; + double dc_q_step = av1_dc_quant_QTX(q_index, 0, AOM_BITS_8) / 4.; + double ac_q_step = av1_ac_quant_QTX(q_index, 0, AOM_BITS_8) / 4.; + double est_rate = 0; + // dc coeff + est_rate += av1_estimate_coeff_entropy(dc_q_step, abs_coeff_mean[0], + zero_bin_ratio, qcoeff_arr[0]); + // ac coeff + for (int i = 1; i < coeff_num; ++i) { + est_rate += av1_estimate_coeff_entropy(ac_q_step, abs_coeff_mean[i], + zero_bin_ratio, qcoeff_arr[i]); + } + return est_rate; +} diff --git a/third_party/libaom/source/libaom/av1/encoder/tpl_model.h b/third_party/libaom/source/libaom/av1/encoder/tpl_model.h index 4b85740f3e..c764d92239 100644 --- 
a/third_party/libaom/source/libaom/av1/encoder/tpl_model.h +++ b/third_party/libaom/source/libaom/av1/encoder/tpl_model.h @@ -18,11 +18,20 @@ extern "C" { /*!\cond */ +struct AV1_PRIMARY; struct AV1_COMP; +struct AV1_SEQ_CODING_TOOLS; struct EncodeFrameParams; struct EncodeFrameInput; -#include "av1/encoder/encoder.h" +#include "config/aom_config.h" + +#include "aom_scale/yv12config.h" + +#include "av1/common/mv.h" +#include "av1/common/scale.h" +#include "av1/encoder/block.h" +#include "av1/encoder/lookahead.h" static INLINE BLOCK_SIZE convert_length_to_bsize(int length) { switch (length) { @@ -82,6 +91,14 @@ typedef struct AV1TplRowMultiThreadInfo { #define MAX_TPL_EXTEND (MAX_LAG_BUFFERS - MAX_GF_INTERVAL) #define TPL_DEP_COST_SCALE_LOG2 4 +#define TPL_EPSILON 0.0000001 + +typedef struct TplTxfmStats { + double abs_coeff_sum[256]; // Assume we are using 16x16 transform block + int txfm_block_count; + int coeff_num; +} TplTxfmStats; + typedef struct TplDepStats { int64_t intra_cost; int64_t inter_cost; @@ -90,6 +107,7 @@ typedef struct TplDepStats { int64_t cmp_recrf_dist[2]; int64_t srcrf_rate; int64_t recrf_rate; + int64_t srcrf_sse; int64_t cmp_recrf_rate[2]; int64_t mc_dep_rate; int64_t mc_dep_dist; @@ -111,10 +129,6 @@ typedef struct TplDepFrame { int mi_cols; int base_rdmult; uint32_t frame_display_index; - double abs_coeff_sum[256]; // Assume we are using 16x16 transform block - double abs_coeff_mean[256]; - int coeff_num; // number of coefficients in a transform block - int txfm_block_count; } TplDepFrame; /*!\endcond */ @@ -147,6 +161,12 @@ typedef struct TplParams { TplDepStats *tpl_stats_pool[MAX_LAG_BUFFERS]; /*! + * Buffer to store tpl transform stats per frame. + * txfm_stats_list[i] stores the TplTxfmStats of the ith frame in a gf group. + */ + TplTxfmStats txfm_stats_list[MAX_LENGTH_TPL_FRAME_STATS]; + + /*! * Buffer to store tpl reconstructed frame. * tpl_rec_pool[i] stores the reconstructed frame of ith frame in a gf group. 
*/ @@ -192,10 +212,13 @@ typedef struct TplParams { */ int border_in_pixels; - /*! - * Skip tpl setup when tpl data from gop length decision can be reused. +#if CONFIG_BITRATE_ACCURACY + /* + * Estimated and actual GOP bitrate. */ - int skip_tpl_setup_stats; + double estimated_gop_bitrate; + double actual_gop_bitrate; +#endif } TplParams; /*!\brief Allocate buffers used by tpl model @@ -206,8 +229,9 @@ typedef struct TplParams { * \param[out] tpl_data tpl data structure */ -void av1_setup_tpl_buffers(AV1_COMMON *const cm, TplParams *const tpl_data, - int lag_in_frames); +void av1_setup_tpl_buffers(struct AV1_PRIMARY *const ppi, + CommonModeInfoParams *const mi_params, int width, + int height, int byte_alignment, int lag_in_frames); /*!\brief Implements temporal dependency modelling for a GOP (GF/ARF * group) and selects between 16 and 32 frame GOP structure. @@ -227,6 +251,9 @@ int av1_tpl_setup_stats(struct AV1_COMP *cpi, int gop_eval, /*!\cond */ +void av1_tpl_preload_rc_estimate( + struct AV1_COMP *cpi, const struct EncodeFrameParams *const frame_params); + int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift); void av1_init_tpl_stats(TplParams *const tpl_data); @@ -236,8 +263,9 @@ void av1_tpl_rdmult_setup(struct AV1_COMP *cpi); void av1_tpl_rdmult_setup_sb(struct AV1_COMP *cpi, MACROBLOCK *const x, BLOCK_SIZE sb_size, int mi_row, int mi_col); -void av1_mc_flow_dispenser_row(struct AV1_COMP *cpi, MACROBLOCK *x, int mi_row, - BLOCK_SIZE bsize, TX_SIZE tx_size); +void av1_mc_flow_dispenser_row(struct AV1_COMP *cpi, + TplTxfmStats *tpl_txfm_stats, MACROBLOCK *x, + int mi_row, BLOCK_SIZE bsize, TX_SIZE tx_size); /*!\brief Compute the entropy of an exponential probability distribution * function (pdf) subjected to uniform quantization. 
@@ -271,7 +299,7 @@ double av1_laplace_entropy(double q_step, double b, double zero_bin_ratio); /*!\brief Compute the frame rate using transform block stats * * Assume each position i in the transform block is of Laplace distribution - * with maximum absolute deviation abs_coeff_mean[i] + * with mean absolute deviation abs_coeff_mean[i] * * Then we can use av1_laplace_entropy() to compute the expected frame * rate. @@ -280,7 +308,7 @@ double av1_laplace_entropy(double q_step, double b, double zero_bin_ratio); * * \param[in] q_index quantizer index * \param[in] block_count number of transform blocks - * \param[in] abs_coeff_mean array of maximum absolute deviation + * \param[in] abs_coeff_mean array of mean absolute deviation * \param[in] coeff_num number of coefficients per transform block * * \return expected frame rate @@ -289,15 +317,104 @@ double av1_laplace_estimate_frame_rate(int q_index, int block_count, const double *abs_coeff_mean, int coeff_num); -/*!\brief Init data structure storing transform stats +/* + *!\brief Compute the number of bits needed to encode a GOP + * + * \param[in] q_index_list array of q_index, one per frame + * \param[in] frame_count number of frames in the GOP + * \param[in] stats array of transform stats, one per frame + * + */ +double av1_estimate_gop_bitrate(const unsigned char *q_index_list, + const int frame_count, + const TplTxfmStats *stats); + +/* + *!\brief Init TplTxfmStats + * + * \param[in] tpl_txfm_stats a structure for storing transform stats + * + * + */ +void av1_init_tpl_txfm_stats(TplTxfmStats *tpl_txfm_stats); + +/* + *!\brief Accumulate TplTxfmStats + * + * \param[in] sub_stats a structure for storing sub transform stats + * \param[out] accumulated_stats a structure for storing accumulated transform + *stats + * + */ +void av1_accumulate_tpl_txfm_stats(const TplTxfmStats *sub_stats, + TplTxfmStats *accumulated_stats); + +/* + *!\brief Record a transform block into TplTxfmStats + * + * \param[in] tpl_txfm_stats A 
structure for storing transform stats + * \param[out] coeff An array of transform coefficients. Its size + * should equal to tpl_txfm_stats.coeff_num. + * + */ +void av1_record_tpl_txfm_block(TplTxfmStats *tpl_txfm_stats, + const tran_low_t *coeff); + +/*!\brief Estimate coefficient entropy using Laplace dsitribution * *\ingroup tpl_modelling * - * \param[in] tpl_frame pointer of tpl frame data structure + * This function is equivalent to -log2(laplace_prob()), where laplace_prob() is + * defined in tpl_model_test.cc + * + * \param[in] q_step quantizer step size without any scaling + * \param[in] b mean absolute deviation of Laplace distribution + * \param[in] zero_bin_ratio zero bin's size is zero_bin_ratio * q_step + * \param[in] qcoeff quantized coefficient + * + * \return estimated coefficient entropy + * + */ +double av1_estimate_coeff_entropy(double q_step, double b, + double zero_bin_ratio, int qcoeff); + +/*!\brief Estimate entropy of a transform block using Laplace dsitribution + * + *\ingroup tpl_modelling + * + * \param[in] q_index quantizer index + * \param[in] abs_coeff_mean array of mean absolute deviations + * \param[in] qcoeff_arr array of quantized coefficients * \param[in] coeff_num number of coefficients per transform block * + * \return estimated transform block entropy + * + */ +double av1_estimate_txfm_block_entropy(int q_index, + const double *abs_coeff_mean, + int *qcoeff_arr, int coeff_num); + +// TODO(angiebird): Add doxygen description here. +int64_t av1_delta_rate_cost(int64_t delta_rate, int64_t recrf_dist, + int64_t srcrf_dist, int pix_num); + +/*!\brief Compute the overlap area between two blocks with the same size + * + *\ingroup tpl_modelling + * + * If there is no overlap, this function should return zero. 
+ * + * \param[in] row_a row position of the first block + * \param[in] col_a column position of the first block + * \param[in] row_b row position of the second block + * \param[in] col_b column position of the second block + * \param[in] width width shared by the two blocks + * \param[in] height height shared by the two blocks + * + * \return overlap area of the two blocks */ -void av1_tpl_stats_init_txfm_stats(TplDepFrame *tpl_frame, int coeff_num); +int av1_get_overlap_area(int row_a, int col_a, int row_b, int col_b, int width, + int height); /*!\endcond */ #ifdef __cplusplus diff --git a/third_party/libaom/source/libaom/av1/encoder/tune_butteraugli.c b/third_party/libaom/source/libaom/av1/encoder/tune_butteraugli.c index 39940e8aa6..f82e910595 100644 --- a/third_party/libaom/source/libaom/av1/encoder/tune_butteraugli.c +++ b/third_party/libaom/source/libaom/av1/encoder/tune_butteraugli.c @@ -15,24 +15,34 @@ #include "aom_dsp/butteraugli.h" #include "aom_ports/system_state.h" -#include "av1/encoder/rdopt.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/encoder_utils.h" #include "av1/encoder/extend.h" +#include "av1/encoder/var_based_part.h" static const int resize_factor = 2; -void set_mb_butteraugli_rdmult_scaling(AV1_COMP *cpi, - const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *recon) { +static void set_mb_butteraugli_rdmult_scaling(AV1_COMP *cpi, + const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *recon, + const double K) { AV1_COMMON *const cm = &cpi->common; + SequenceHeader *const seq_params = cm->seq_params; const CommonModeInfoParams *const mi_params = &cm->mi_params; + const aom_color_range_t color_range = + seq_params->color_range != 0 ? 
AOM_CR_FULL_RANGE : AOM_CR_STUDIO_RANGE; const int bit_depth = cpi->td.mb.e_mbd.bd; const int width = source->y_crop_width; const int height = source->y_crop_height; + const int ss_x = source->subsampling_x; + const int ss_y = source->subsampling_y; float *diffmap; CHECK_MEM_ERROR(cm, diffmap, aom_malloc(width * height * sizeof(*diffmap))); - if (!aom_calc_butteraugli(source, recon, bit_depth, diffmap)) { - aom_internal_error(&cm->error, AOM_CODEC_ERROR, + if (!aom_calc_butteraugli(source, recon, bit_depth, + seq_params->matrix_coefficients, color_range, + diffmap)) { + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Failed to calculate Butteraugli distances."); } @@ -55,6 +65,7 @@ void set_mb_butteraugli_rdmult_scaling(AV1_COMP *cpi, const int x_start = col * block_w; float dbutteraugli = 0.0f; float dmse = 0.0f; + float px_count = 0.0f; // Loop through each pixel. for (int y = y_start; y < y_start + block_h && y < height; y++) { @@ -63,25 +74,28 @@ void set_mb_butteraugli_rdmult_scaling(AV1_COMP *cpi, float px_diff = source->y_buffer[y * source->y_stride + x] - recon->y_buffer[y * recon->y_stride + x]; dmse += px_diff * px_diff; + px_count += 1.0f; } } - for (int y = y_start; y < y_start + block_h && y < height; y += 2) { - for (int x = x_start; x < x_start + block_w && x < width; x += 2) { - const int src_px_index = y / 2 * source->uv_stride + x / 2; - const int recon_px_index = y / 2 * recon->uv_stride + x / 2; + const int y_end = AOMMIN((y_start >> ss_y) + (block_h >> ss_y), + (height + ss_y) >> ss_y); + for (int y = y_start >> ss_y; y < y_end; y++) { + const int x_end = AOMMIN((x_start >> ss_x) + (block_w >> ss_x), + (width + ss_x) >> ss_x); + for (int x = x_start >> ss_x; x < x_end; x++) { + const int src_px_index = y * source->uv_stride + x; + const int recon_px_index = y * recon->uv_stride + x; const float px_diff_u = (float)(source->u_buffer[src_px_index] - recon->u_buffer[recon_px_index]); const float px_diff_v = (float)(source->v_buffer[src_px_index] - 
recon->v_buffer[recon_px_index]); dmse += px_diff_u * px_diff_u + px_diff_v * px_diff_v; + px_count += 2.0f; } } dbutteraugli = powf(dbutteraugli, 1.0f / 12.0f); - dmse = dmse / (2.0f * (float)block_w * (float)block_h); - // 'K' is used to balance the rate-distortion distribution between PSNR - // and Butteraugli. - const double K = 0.4; + dmse = dmse / px_count; const float eps = 0.01f; double weight; if (dbutteraugli < eps || dmse < eps) { @@ -166,10 +180,12 @@ static void copy_img(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int width, int height) { copy_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, width, height); + const int width_uv = (width + src->subsampling_x) >> src->subsampling_x; + const int height_uv = (height + src->subsampling_y) >> src->subsampling_y; copy_plane(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, - width / 2, height / 2); + width_uv, height_uv); copy_plane(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, - width / 2, height / 2); + width_uv, height_uv); } static void zero_plane(uint8_t *dst, int dst_stride, int h) { @@ -192,9 +208,11 @@ void av1_setup_butteraugli_source(AV1_COMP *cpi) { const int width = cpi->source->y_crop_width; const int height = cpi->source->y_crop_height; const int bit_depth = cpi->td.mb.e_mbd.bd; + const int ss_x = cpi->source->subsampling_x; + const int ss_y = cpi->source->subsampling_y; if (dst->buffer_alloc_sz == 0) { aom_alloc_frame_buffer( - dst, width, height, 1, 1, cm->seq_params.use_highbitdepth, + dst, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment); } av1_copy_and_extend_frame(cpi->source, dst); @@ -202,8 +220,8 @@ void av1_setup_butteraugli_source(AV1_COMP *cpi) { YV12_BUFFER_CONFIG *const resized_dst = &cpi->butteraugli_info.resized_source; if (resized_dst->buffer_alloc_sz == 0) { aom_alloc_frame_buffer( - resized_dst, width / resize_factor, height / resize_factor, 1, 1, - 
cm->seq_params.use_highbitdepth, cpi->oxcf.border_in_pixels, + resized_dst, width / resize_factor, height / resize_factor, ss_x, ss_y, + cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment); } av1_resize_and_extend_frame_nonnormative(cpi->source, resized_dst, bit_depth, @@ -215,25 +233,86 @@ void av1_setup_butteraugli_source(AV1_COMP *cpi) { aom_clear_system_state(); } -void av1_restore_butteraugli_source(AV1_COMP *cpi) { +void av1_setup_butteraugli_rdmult_and_restore_source(AV1_COMP *cpi, double K) { aom_clear_system_state(); av1_copy_and_extend_frame(&cpi->butteraugli_info.source, cpi->source); AV1_COMMON *const cm = &cpi->common; const int width = cpi->source->y_crop_width; const int height = cpi->source->y_crop_height; + const int ss_x = cpi->source->subsampling_x; + const int ss_y = cpi->source->subsampling_y; YV12_BUFFER_CONFIG resized_recon; memset(&resized_recon, 0, sizeof(resized_recon)); aom_alloc_frame_buffer( - &resized_recon, width / resize_factor, height / resize_factor, 1, 1, - cm->seq_params.use_highbitdepth, cpi->oxcf.border_in_pixels, + &resized_recon, width / resize_factor, height / resize_factor, ss_x, ss_y, + cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment); copy_img(&cpi->common.cur_frame->buf, &resized_recon, width / resize_factor, height / resize_factor); set_mb_butteraugli_rdmult_scaling(cpi, &cpi->butteraugli_info.resized_source, - &resized_recon); + &resized_recon, K); cpi->butteraugli_info.recon_set = true; aom_free_frame_buffer(&resized_recon); aom_clear_system_state(); } + +void av1_setup_butteraugli_rdmult(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const QuantizationCfg *const q_cfg = &oxcf->q_cfg; + const int q_index = 96; + aom_clear_system_state(); + + // Setup necessary params for encoding, including frame source, etc. 
+ if (cm->current_frame.frame_type == KEY_FRAME) copy_frame_prob_info(cpi); + av1_set_frame_size(cpi, cm->superres_upscaled_width, + cm->superres_upscaled_height); + + cpi->source = + av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source, + cm->features.interp_filter, 0, false, false); + if (cpi->unscaled_last_source != NULL) { + cpi->last_source = av1_scale_if_required( + cm, cpi->unscaled_last_source, &cpi->scaled_last_source, + cm->features.interp_filter, 0, false, false); + } + + av1_setup_butteraugli_source(cpi); + av1_setup_frame(cpi); + + if (cm->seg.enabled) { + if (!cm->seg.update_data && cm->prev_frame) { + segfeatures_copy(&cm->seg, &cm->prev_frame->seg); + cm->seg.enabled = cm->prev_frame->seg.enabled; + } else { + av1_calculate_segdata(&cm->seg); + } + } else { + memset(&cm->seg, 0, sizeof(cm->seg)); + } + segfeatures_copy(&cm->cur_frame->seg, &cm->seg); + cm->cur_frame->seg.enabled = cm->seg.enabled; + + const PARTITION_SEARCH_TYPE partition_search_type = + cpi->sf.part_sf.partition_search_type; + const BLOCK_SIZE fixed_partition_size = cpi->sf.part_sf.fixed_partition_size; + // Enable a quicker pass by uncommenting the following lines: + // cpi->sf.part_sf.partition_search_type = FIXED_PARTITION; + // cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32; + + av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q_index, + q_cfg->enable_chroma_deltaq); + av1_set_speed_features_qindex_dependent(cpi, oxcf->speed); + if (q_cfg->deltaq_mode != NO_DELTA_Q || q_cfg->enable_chroma_deltaq) + av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, + cm->seq_params->bit_depth); + + av1_set_variance_partition_thresholds(cpi, q_index, 0); + av1_encode_frame(cpi); + + av1_setup_butteraugli_rdmult_and_restore_source(cpi, 0.3); + cpi->sf.part_sf.partition_search_type = partition_search_type; + cpi->sf.part_sf.fixed_partition_size = fixed_partition_size; +} diff --git a/third_party/libaom/source/libaom/av1/encoder/tune_butteraugli.h 
b/third_party/libaom/source/libaom/av1/encoder/tune_butteraugli.h index a4af31c718..7b7b0b64d3 100644 --- a/third_party/libaom/source/libaom/av1/encoder/tune_butteraugli.h +++ b/third_party/libaom/source/libaom/av1/encoder/tune_butteraugli.h @@ -38,6 +38,10 @@ void av1_setup_butteraugli_recon(AV1_COMP *cpi, void av1_setup_butteraugli_source(AV1_COMP *cpi); -void av1_restore_butteraugli_source(AV1_COMP *cpi); +// 'K' is used to balance the rate-distortion distribution between PSNR +// and Butteraugli. +void av1_setup_butteraugli_rdmult_and_restore_source(AV1_COMP *cpi, double K); + +void av1_setup_butteraugli_rdmult(AV1_COMP *cpi); #endif // AOM_AV1_ENCODER_TUNE_BUTTERAUGLI_H_ diff --git a/third_party/libaom/source/libaom/av1/encoder/tune_vmaf.c b/third_party/libaom/source/libaom/av1/encoder/tune_vmaf.c index f5b6129407..0c28cebefa 100644 --- a/third_party/libaom/source/libaom/av1/encoder/tune_vmaf.c +++ b/third_party/libaom/source/libaom/av1/encoder/tune_vmaf.c @@ -15,9 +15,7 @@ #include "aom_ports/system_state.h" #include "av1/encoder/extend.h" #include "av1/encoder/rdopt.h" -#if CONFIG_USE_VMAF_RC #include "config/aom_scale_rtcd.h" -#endif static const double kBaselineVmaf = 97.42773; @@ -89,9 +87,9 @@ static unsigned int residual_variance(const AV1_COMP *cpi, assert(y_stride == ref->y_stride); const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width; const int mv_offset = ref_mv.row * y_stride + ref_mv.col; - const unsigned int var = - cpi->fn_ptr[block_size].vf(ref->y_buffer + y_offset + mv_offset, y_stride, - src->y_buffer + y_offset, y_stride, sse); + const unsigned int var = cpi->ppi->fn_ptr[block_size].vf( + ref->y_buffer + y_offset + mv_offset, y_stride, src->y_buffer + y_offset, + y_stride, sse); return var; } @@ -117,7 +115,7 @@ static double frame_average_variance(const AV1_COMP *const cpi, buf.buf = (uint8_t *)y_buffer + row_offset_y * y_stride + col_offset_y; buf.stride = y_stride; - if (cpi->common.seq_params.use_highbitdepth) { + if 
(cpi->common.seq_params->use_highbitdepth) { assert(frame->flags & YV12_FLAG_HIGHBITDEPTH); var += av1_high_get_sby_perpixel_variance(cpi, &buf, block_size, bit_depth); @@ -234,7 +232,7 @@ static AOM_INLINE void unsharp(const AV1_COMP *const cpi, const YV12_BUFFER_CONFIG *blurred, const YV12_BUFFER_CONFIG *dst, double amount) { const int bit_depth = cpi->td.mb.e_mbd.bd; - if (cpi->common.seq_params.use_highbitdepth) { + if (cpi->common.seq_params->use_highbitdepth) { assert(source->flags & YV12_FLAG_HIGHBITDEPTH); assert(blurred->flags & YV12_FLAG_HIGHBITDEPTH); assert(dst->flags & YV12_FLAG_HIGHBITDEPTH); @@ -294,38 +292,27 @@ static AOM_INLINE void gaussian_blur(const int bit_depth, } static AOM_INLINE double cal_approx_vmaf(const AV1_COMP *const cpi, -#if CONFIG_USE_VMAF_RC - VmafContext *vmaf_context, - int *vmaf_cal_index, -#endif double source_variance, YV12_BUFFER_CONFIG *const source, YV12_BUFFER_CONFIG *const sharpened) { const int bit_depth = cpi->td.mb.e_mbd.bd; + const bool cal_vmaf_neg = + cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN; double new_vmaf; -#if CONFIG_USE_VMAF_RC - aom_calc_vmaf_at_index_rc(vmaf_context, cpi->vmaf_info.vmaf_model, source, - sharpened, bit_depth, *vmaf_cal_index, &new_vmaf); - (*vmaf_cal_index)++; -#else - aom_calc_vmaf(cpi->oxcf.tune_cfg.vmaf_model_path, source, sharpened, - bit_depth, &new_vmaf); -#endif + aom_calc_vmaf(cpi->vmaf_info.vmaf_model, source, sharpened, bit_depth, + cal_vmaf_neg, &new_vmaf); const double sharpened_var = frame_average_variance(cpi, sharpened); return source_variance / sharpened_var * (new_vmaf - kBaselineVmaf); } static double find_best_frame_unsharp_amount_loop( - const AV1_COMP *const cpi, -#if CONFIG_USE_VMAF_RC - VmafContext *vmaf_context, int *vmaf_cal_index, -#endif - YV12_BUFFER_CONFIG *const source, YV12_BUFFER_CONFIG *const blurred, - YV12_BUFFER_CONFIG *const sharpened, double best_vmaf, - const double baseline_variance, const double unsharp_amount_start, - const double 
step_size, const int max_loop_count, const double max_amount) { + const AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const source, + YV12_BUFFER_CONFIG *const blurred, YV12_BUFFER_CONFIG *const sharpened, + double best_vmaf, const double baseline_variance, + const double unsharp_amount_start, const double step_size, + const int max_loop_count, const double max_amount) { const double min_amount = 0.0; int loop_count = 0; double approx_vmaf = best_vmaf; @@ -335,11 +322,7 @@ static double find_best_frame_unsharp_amount_loop( unsharp_amount += step_size; if (unsharp_amount > max_amount || unsharp_amount < min_amount) break; unsharp(cpi, source, blurred, sharpened, unsharp_amount); - approx_vmaf = cal_approx_vmaf(cpi, -#if CONFIG_USE_VMAF_RC - vmaf_context, vmaf_cal_index, -#endif - baseline_variance, source, sharpened); + approx_vmaf = cal_approx_vmaf(cpi, baseline_variance, source, sharpened); loop_count++; } while (approx_vmaf > best_vmaf && loop_count < max_loop_count); @@ -358,73 +341,43 @@ static double find_best_frame_unsharp_amount(const AV1_COMP *const cpi, const AV1_COMMON *const cm = &cpi->common; const int width = source->y_width; const int height = source->y_height; -#if CONFIG_USE_VMAF_RC - VmafContext *vmaf_context; - aom_init_vmaf_context_rc( - &vmaf_context, cpi->vmaf_info.vmaf_model, - cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN); - int vmaf_cal_index = 0; -#endif YV12_BUFFER_CONFIG sharpened; memset(&sharpened, 0, sizeof(sharpened)); aom_alloc_frame_buffer( - &sharpened, width, height, 1, 1, cm->seq_params.use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + &sharpened, width, height, source->subsampling_x, source->subsampling_y, + cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); const double baseline_variance = frame_average_variance(cpi, source); double unsharp_amount; if (unsharp_amount_start <= step_size) { unsharp_amount = find_best_frame_unsharp_amount_loop( - cpi, 
-#if CONFIG_USE_VMAF_RC - vmaf_context, &vmaf_cal_index, -#endif - source, blurred, &sharpened, 0.0, baseline_variance, 0.0, step_size, - max_loop_count, max_filter_amount); + cpi, source, blurred, &sharpened, 0.0, baseline_variance, 0.0, + step_size, max_loop_count, max_filter_amount); } else { double a0 = unsharp_amount_start - step_size, a1 = unsharp_amount_start; double v0, v1; unsharp(cpi, source, blurred, &sharpened, a0); - v0 = cal_approx_vmaf(cpi, -#if CONFIG_USE_VMAF_RC - vmaf_context, &vmaf_cal_index, -#endif - baseline_variance, source, &sharpened); + v0 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened); unsharp(cpi, source, blurred, &sharpened, a1); - v1 = cal_approx_vmaf(cpi, -#if CONFIG_USE_VMAF_RC - vmaf_context, &vmaf_cal_index, -#endif - baseline_variance, source, &sharpened); + v1 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened); if (fabs(v0 - v1) < 0.01) { unsharp_amount = a0; } else if (v0 > v1) { unsharp_amount = find_best_frame_unsharp_amount_loop( - cpi, -#if CONFIG_USE_VMAF_RC - vmaf_context, &vmaf_cal_index, -#endif - source, blurred, &sharpened, v0, baseline_variance, a0, -step_size, - max_loop_count, max_filter_amount); + cpi, source, blurred, &sharpened, v0, baseline_variance, a0, + -step_size, max_loop_count, max_filter_amount); } else { unsharp_amount = find_best_frame_unsharp_amount_loop( - cpi, -#if CONFIG_USE_VMAF_RC - vmaf_context, &vmaf_cal_index, -#endif - source, blurred, &sharpened, v1, baseline_variance, a1, step_size, - max_loop_count, max_filter_amount); + cpi, source, blurred, &sharpened, v1, baseline_variance, a1, + step_size, max_loop_count, max_filter_amount); } } aom_free_frame_buffer(&sharpened); -#if CONFIG_USE_VMAF_RC - aom_close_vmaf_context_rc(vmaf_context); -#endif return unsharp_amount; } -#if CONFIG_USE_VMAF_RC void av1_vmaf_neg_preprocessing(AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const source) { aom_clear_system_state(); @@ -433,9 +386,9 @@ void av1_vmaf_neg_preprocessing(AV1_COMP 
*const cpi, const int width = source->y_width; const int height = source->y_height; - const GF_GROUP *const gf_group = &cpi->gf_group; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const int layer_depth = - AOMMIN(gf_group->layer_depth[gf_group->index], MAX_ARF_LAYERS - 1); + AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1); const double best_frame_unsharp_amount = get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth); @@ -444,15 +397,15 @@ void av1_vmaf_neg_preprocessing(AV1_COMP *const cpi, YV12_BUFFER_CONFIG blurred; memset(&blurred, 0, sizeof(blurred)); aom_alloc_frame_buffer( - &blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + &blurred, width, height, source->subsampling_x, source->subsampling_y, + cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); gaussian_blur(bit_depth, source, &blurred); unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount); aom_free_frame_buffer(&blurred); aom_clear_system_state(); } -#endif void av1_vmaf_frame_preprocessing(AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const source) { @@ -466,19 +419,21 @@ void av1_vmaf_frame_preprocessing(AV1_COMP *const cpi, memset(&source_extended, 0, sizeof(source_extended)); memset(&blurred, 0, sizeof(blurred)); aom_alloc_frame_buffer( - &source_extended, width, height, 1, 1, cm->seq_params.use_highbitdepth, + &source_extended, width, height, source->subsampling_x, + source->subsampling_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment); aom_alloc_frame_buffer( - &blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + &blurred, width, height, source->subsampling_x, source->subsampling_y, + cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); av1_copy_and_extend_frame(source, 
&source_extended); gaussian_blur(bit_depth, &source_extended, &blurred); aom_free_frame_buffer(&source_extended); - const GF_GROUP *const gf_group = &cpi->gf_group; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const int layer_depth = - AOMMIN(gf_group->layer_depth[gf_group->index], MAX_ARF_LAYERS - 1); + AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1); const double last_frame_unsharp_amount = get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth); @@ -500,24 +455,27 @@ void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi, const int width = source->y_width; const int height = source->y_height; const int bit_depth = cpi->td.mb.e_mbd.bd; + const int ss_x = source->subsampling_x; + const int ss_y = source->subsampling_y; YV12_BUFFER_CONFIG source_extended, blurred; memset(&blurred, 0, sizeof(blurred)); memset(&source_extended, 0, sizeof(source_extended)); aom_alloc_frame_buffer( - &blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment); - aom_alloc_frame_buffer( - &source_extended, width, height, 1, 1, cm->seq_params.use_highbitdepth, + &blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + aom_alloc_frame_buffer(&source_extended, width, height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); av1_copy_and_extend_frame(source, &source_extended); gaussian_blur(bit_depth, &source_extended, &blurred); aom_free_frame_buffer(&source_extended); - const GF_GROUP *const gf_group = &cpi->gf_group; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const int layer_depth = - AOMMIN(gf_group->layer_depth[gf_group->index], MAX_ARF_LAYERS - 1); + AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1); const double last_frame_unsharp_amount = get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth); 
@@ -540,12 +498,14 @@ void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi, YV12_BUFFER_CONFIG source_block, blurred_block; memset(&source_block, 0, sizeof(source_block)); memset(&blurred_block, 0, sizeof(blurred_block)); - aom_alloc_frame_buffer( - &source_block, block_w, block_h, 1, 1, cm->seq_params.use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment); - aom_alloc_frame_buffer( - &blurred_block, block_w, block_h, 1, 1, cm->seq_params.use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + aom_alloc_frame_buffer(&source_block, block_w, block_h, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); + aom_alloc_frame_buffer(&blurred_block, block_w, block_h, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); for (int row = 0; row < num_rows; ++row) { for (int col = 0; col < num_cols; ++col) { @@ -555,7 +515,7 @@ void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi, const int block_height = AOMMIN(height - row_offset_y, block_h); const int index = col + row * num_cols; - if (cm->seq_params.use_highbitdepth) { + if (cm->seq_params->use_highbitdepth) { assert(source->flags & YV12_FLAG_HIGHBITDEPTH); assert(blurred.flags & YV12_FLAG_HIGHBITDEPTH); uint16_t *frame_src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) + @@ -624,7 +584,7 @@ void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi, const int block_height = AOMMIN(source->y_height - row_offset_y, block_h); const int index = col + row * num_cols; - if (cm->seq_params.use_highbitdepth) { + if (cm->seq_params->use_highbitdepth) { assert(source->flags & YV12_FLAG_HIGHBITDEPTH); assert(blurred.flags & YV12_FLAG_HIGHBITDEPTH); uint16_t *src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) + @@ -654,93 +614,6 @@ void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi, aom_clear_system_state(); } -#if !CONFIG_USE_VMAF_RC -typedef struct FrameData { - const YV12_BUFFER_CONFIG 
*source, *blurred; - int block_w, block_h, num_rows, num_cols, row, col, bit_depth; -} FrameData; - -// A callback function used to pass data to VMAF. -// Returns 0 after reading a frame. -// Returns 2 when there is no more frame to read. -static int update_frame(float *ref_data, float *main_data, float *temp_data, - int stride, void *user_data) { - FrameData *frames = (FrameData *)user_data; - const int width = frames->source->y_width; - const int height = frames->source->y_height; - const int row = frames->row; - const int col = frames->col; - const int num_rows = frames->num_rows; - const int num_cols = frames->num_cols; - const int block_w = frames->block_w; - const int block_h = frames->block_h; - const YV12_BUFFER_CONFIG *source = frames->source; - const YV12_BUFFER_CONFIG *blurred = frames->blurred; - const int bit_depth = frames->bit_depth; - const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8)); - (void)temp_data; - stride /= (int)sizeof(*ref_data); - - for (int i = 0; i < height; ++i) { - float *ref, *main; - ref = ref_data + i * stride; - main = main_data + i * stride; - if (source->flags & YV12_FLAG_HIGHBITDEPTH) { - uint16_t *src; - src = CONVERT_TO_SHORTPTR(source->y_buffer) + i * source->y_stride; - for (int j = 0; j < width; ++j) { - ref[j] = main[j] = scale_factor * (float)src[j]; - } - } else { - uint8_t *src; - src = source->y_buffer + i * source->y_stride; - for (int j = 0; j < width; ++j) { - ref[j] = main[j] = (float)src[j]; - } - } - } - if (row < num_rows && col < num_cols) { - // Set current block - const int row_offset = row * block_h; - const int col_offset = col * block_w; - const int block_width = AOMMIN(width - col_offset, block_w); - const int block_height = AOMMIN(height - row_offset, block_h); - - float *main_buf = main_data + col_offset + row_offset * stride; - if (source->flags & YV12_FLAG_HIGHBITDEPTH) { - uint16_t *blurred_buf = CONVERT_TO_SHORTPTR(blurred->y_buffer) + - row_offset * blurred->y_stride + col_offset; - 
for (int i = 0; i < block_height; ++i) { - for (int j = 0; j < block_width; ++j) { - main_buf[j] = scale_factor * (float)blurred_buf[j]; - } - main_buf += stride; - blurred_buf += blurred->y_stride; - } - } else { - uint8_t *blurred_buf = - blurred->y_buffer + row_offset * blurred->y_stride + col_offset; - for (int i = 0; i < block_height; ++i) { - for (int j = 0; j < block_width; ++j) { - main_buf[j] = (float)blurred_buf[j]; - } - main_buf += stride; - blurred_buf += blurred->y_stride; - } - } - - frames->col++; - if (frames->col >= num_cols) { - frames->col = 0; - frames->row++; - } - return 0; - } else { - return 2; - } -} -#endif - void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; const int y_width = cpi->source->y_width; @@ -748,13 +621,15 @@ void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) { const int resized_block_size = BLOCK_32X32; const int resize_factor = 2; const int bit_depth = cpi->td.mb.e_mbd.bd; + const int ss_x = cpi->source->subsampling_x; + const int ss_y = cpi->source->subsampling_y; aom_clear_system_state(); YV12_BUFFER_CONFIG resized_source; memset(&resized_source, 0, sizeof(resized_source)); aom_alloc_frame_buffer( - &resized_source, y_width / resize_factor, y_height / resize_factor, 1, 1, - cm->seq_params.use_highbitdepth, cpi->oxcf.border_in_pixels, + &resized_source, y_width / resize_factor, y_height / resize_factor, ss_x, + ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment); av1_resize_and_extend_frame_nonnormative(cpi->source, &resized_source, bit_depth, av1_num_planes(cm)); @@ -770,42 +645,26 @@ void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) { YV12_BUFFER_CONFIG blurred; memset(&blurred, 0, sizeof(blurred)); - aom_alloc_frame_buffer(&blurred, resized_y_width, resized_y_height, 1, 1, - cm->seq_params.use_highbitdepth, + aom_alloc_frame_buffer(&blurred, resized_y_width, resized_y_height, ss_x, + ss_y, cm->seq_params->use_highbitdepth, 
cpi->oxcf.border_in_pixels, cm->features.byte_alignment); gaussian_blur(bit_depth, &resized_source, &blurred); -#if CONFIG_USE_VMAF_RC YV12_BUFFER_CONFIG recon; memset(&recon, 0, sizeof(recon)); - aom_alloc_frame_buffer(&recon, resized_y_width, resized_y_height, 1, 1, - cm->seq_params.use_highbitdepth, + aom_alloc_frame_buffer(&recon, resized_y_width, resized_y_height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment); aom_yv12_copy_frame(&resized_source, &recon, 1); VmafContext *vmaf_context; - aom_init_vmaf_context_rc( - &vmaf_context, cpi->vmaf_info.vmaf_model, - cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN); -#else - double *scores = aom_malloc(sizeof(*scores) * (num_rows * num_cols)); - memset(scores, 0, sizeof(*scores) * (num_rows * num_cols)); - FrameData frame_data; - frame_data.source = &resized_source; - frame_data.blurred = &blurred; - frame_data.block_w = resized_block_w; - frame_data.block_h = resized_block_h; - frame_data.num_rows = num_rows; - frame_data.num_cols = num_cols; - frame_data.row = 0; - frame_data.col = 0; - frame_data.bit_depth = bit_depth; - aom_calc_vmaf_multi_frame(&frame_data, cpi->oxcf.tune_cfg.vmaf_model_path, - update_frame, resized_y_width, resized_y_height, - bit_depth, scores); -#endif + const bool cal_vmaf_neg = + cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN; + aom_init_vmaf_context(&vmaf_context, cpi->vmaf_info.vmaf_model, cal_vmaf_neg); + unsigned int *sses = aom_malloc(sizeof(*sses) * (num_rows * num_cols)); + memset(sses, 0, sizeof(*sses) * (num_rows * num_cols)); // Loop through each 'block_size' block. 
for (int row = 0; row < num_rows; ++row) { @@ -820,15 +679,14 @@ void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) { uint8_t *const blurred_buf = blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y; - unsigned int sse; - cpi->fn_ptr[resized_block_size].vf(orig_buf, resized_source.y_stride, - blurred_buf, blurred.y_stride, &sse); + cpi->ppi->fn_ptr[resized_block_size].vf(orig_buf, resized_source.y_stride, + blurred_buf, blurred.y_stride, + &sses[index]); -#if CONFIG_USE_VMAF_RC uint8_t *const recon_buf = recon.y_buffer + row_offset_y * recon.y_stride + col_offset_y; // Set recon buf - if (cpi->common.seq_params.use_highbitdepth) { + if (cpi->common.seq_params->use_highbitdepth) { highbd_unsharp_rect(CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride, CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride, CONVERT_TO_SHORTPTR(recon_buf), recon.y_stride, @@ -839,13 +697,11 @@ void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) { resized_block_w, resized_block_h, 0.0); } - double vmaf; - aom_calc_vmaf_at_index_rc(vmaf_context, cpi->vmaf_info.vmaf_model, - &resized_source, &recon, bit_depth, index, - &vmaf); + aom_read_vmaf_image(vmaf_context, &resized_source, &recon, bit_depth, + index); // Restore recon buf - if (cpi->common.seq_params.use_highbitdepth) { + if (cpi->common.seq_params->use_highbitdepth) { highbd_unsharp_rect( CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride, CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride, @@ -856,13 +712,18 @@ void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) { resized_source.y_stride, recon_buf, recon.y_stride, resized_block_w, resized_block_h, 0.0); } -#else - const double vmaf = scores[index]; -#endif + } + } + aom_flush_vmaf_context(vmaf_context); + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int index = row * num_cols + col; + const double vmaf = aom_calc_vmaf_at_index( + vmaf_context, cpi->vmaf_info.vmaf_model, index); const double dvmaf = kBaselineVmaf 
- vmaf; const double mse = - (double)sse / (double)(resized_y_width * resized_y_height); + (double)sses[index] / (double)(resized_y_width * resized_y_height); double weight; const double eps = 0.01 / (num_rows * num_cols); if (dvmaf < eps || mse < eps) { @@ -879,11 +740,8 @@ void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) { aom_free_frame_buffer(&resized_source); aom_free_frame_buffer(&blurred); -#if CONFIG_USE_VMAF_RC - aom_close_vmaf_context_rc(vmaf_context); -#else - aom_free(scores); -#endif + aom_close_vmaf_context(vmaf_context); + aom_free(sses); aom_clear_system_state(); } @@ -967,27 +825,32 @@ static double calc_vmaf_motion_score(const AV1_COMP *const cpi, const int y_height = cur->y_height; YV12_BUFFER_CONFIG blurred_cur, blurred_last, blurred_next; const int bit_depth = cpi->td.mb.e_mbd.bd; + const int ss_x = cur->subsampling_x; + const int ss_y = cur->subsampling_y; memset(&blurred_cur, 0, sizeof(blurred_cur)); memset(&blurred_last, 0, sizeof(blurred_last)); memset(&blurred_next, 0, sizeof(blurred_next)); - aom_alloc_frame_buffer( - &blurred_cur, y_width, y_height, 1, 1, cm->seq_params.use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment); - aom_alloc_frame_buffer( - &blurred_last, y_width, y_height, 1, 1, cm->seq_params.use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment); - aom_alloc_frame_buffer( - &blurred_next, y_width, y_height, 1, 1, cm->seq_params.use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + aom_alloc_frame_buffer(&blurred_cur, y_width, y_height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); + aom_alloc_frame_buffer(&blurred_last, y_width, y_height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); + aom_alloc_frame_buffer(&blurred_next, y_width, y_height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + 
cm->features.byte_alignment); gaussian_blur(bit_depth, cur, &blurred_cur); gaussian_blur(bit_depth, last, &blurred_last); if (next) gaussian_blur(bit_depth, next, &blurred_next); double motion1, motion2 = 65536.0; - if (cm->seq_params.use_highbitdepth) { + if (cm->seq_params->use_highbitdepth) { assert(blurred_cur.flags & YV12_FLAG_HIGHBITDEPTH); assert(blurred_last.flags & YV12_FLAG_HIGHBITDEPTH); const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8)); @@ -1026,9 +889,9 @@ static AOM_INLINE void get_neighbor_frames(const AV1_COMP *const cpi, YV12_BUFFER_CONFIG **last, YV12_BUFFER_CONFIG **next) { const AV1_COMMON *const cm = &cpi->common; - const GF_GROUP *gf_group = &cpi->gf_group; + const GF_GROUP *gf_group = &cpi->ppi->gf_group; const int src_index = - cm->show_frame != 0 ? 0 : gf_group->arf_src_offset[gf_group->index]; + cm->show_frame != 0 ? 0 : gf_group->arf_src_offset[cpi->gf_frame_index]; struct lookahead_entry *last_entry = av1_lookahead_peek( cpi->ppi->lookahead, src_index - 1, cpi->compressor_stage); struct lookahead_entry *next_entry = av1_lookahead_peek( @@ -1046,9 +909,9 @@ int av1_get_vmaf_base_qindex(const AV1_COMP *const cpi, int current_qindex) { return current_qindex; } aom_clear_system_state(); - const GF_GROUP *const gf_group = &cpi->gf_group; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const int layer_depth = - AOMMIN(gf_group->layer_depth[gf_group->index], MAX_ARF_LAYERS - 1); + AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1); const double last_frame_ysse = get_layer_value(cpi->vmaf_info.last_frame_ysse, layer_depth); const double last_frame_vmaf = @@ -1065,7 +928,7 @@ int av1_get_vmaf_base_qindex(const AV1_COMP *const cpi, int current_qindex) { } YV12_BUFFER_CONFIG *cur_buf = cpi->source; if (cm->show_frame == 0) { - const int src_index = gf_group->arf_src_offset[gf_group->index]; + const int src_index = gf_group->arf_src_offset[cpi->gf_frame_index]; struct lookahead_entry *cur_entry = 
av1_lookahead_peek( cpi->ppi->lookahead, src_index, cpi->compressor_stage); cur_buf = &cur_entry->img; @@ -1084,7 +947,8 @@ int av1_get_vmaf_base_qindex(const AV1_COMP *const cpi, int current_qindex) { const double dsse = dvmaf * approx_sse / approx_dvmaf; const double beta = approx_sse / (dsse + approx_sse); - const int offset = av1_get_deltaq_offset(cpi, current_qindex, beta); + const int offset = + av1_get_deltaq_offset(cm->seq_params->bit_depth, current_qindex, beta); int qindex = current_qindex + offset; qindex = AOMMIN(qindex, MAXQ); @@ -1094,23 +958,23 @@ int av1_get_vmaf_base_qindex(const AV1_COMP *const cpi, int current_qindex) { return qindex; } -#if CONFIG_USE_VMAF_RC static AOM_INLINE double cal_approx_score( - AV1_COMP *const cpi, VmafContext *vmaf_context, int vmaf_cal_index, - double src_variance, double new_variance, double src_score, - YV12_BUFFER_CONFIG *const src, YV12_BUFFER_CONFIG *const recon_sharpened) { + AV1_COMP *const cpi, double src_variance, double new_variance, + double src_score, YV12_BUFFER_CONFIG *const src, + YV12_BUFFER_CONFIG *const recon_sharpened) { double score; const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; - aom_calc_vmaf_at_index_rc(vmaf_context, cpi->vmaf_info.vmaf_model, src, - recon_sharpened, bit_depth, vmaf_cal_index, &score); + const bool cal_vmaf_neg = + cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN; + aom_calc_vmaf(cpi->vmaf_info.vmaf_model, src, recon_sharpened, bit_depth, + cal_vmaf_neg, &score); return src_variance / new_variance * (score - src_score); } static double find_best_frame_unsharp_amount_loop_neg( - AV1_COMP *const cpi, VmafContext *vmaf_context, double src_variance, - double base_score, YV12_BUFFER_CONFIG *const src, - YV12_BUFFER_CONFIG *const recon, YV12_BUFFER_CONFIG *const ref, - YV12_BUFFER_CONFIG *const src_blurred, + AV1_COMP *const cpi, double src_variance, double base_score, + YV12_BUFFER_CONFIG *const src, YV12_BUFFER_CONFIG *const recon, + YV12_BUFFER_CONFIG *const ref, 
YV12_BUFFER_CONFIG *const src_blurred, YV12_BUFFER_CONFIG *const recon_blurred, YV12_BUFFER_CONFIG *const src_sharpened, YV12_BUFFER_CONFIG *const recon_sharpened, FULLPEL_MV *mvs, @@ -1120,7 +984,6 @@ static double find_best_frame_unsharp_amount_loop_neg( int loop_count = 0; double approx_score = best_score; double unsharp_amount = unsharp_amount_start; - int vmaf_cal_index = 3; do { best_score = approx_score; @@ -1130,9 +993,8 @@ static double find_best_frame_unsharp_amount_loop_neg( unsharp(cpi, src, src_blurred, src_sharpened, unsharp_amount); const double new_variance = residual_frame_average_variance(cpi, src_sharpened, ref, mvs); - approx_score = - cal_approx_score(cpi, vmaf_context, vmaf_cal_index++, src_variance, - new_variance, base_score, src, recon_sharpened); + approx_score = cal_approx_score(cpi, src_variance, new_variance, base_score, + src, recon_sharpened); loop_count++; } while (approx_score > best_score && loop_count < max_loop_count); @@ -1143,11 +1005,11 @@ static double find_best_frame_unsharp_amount_loop_neg( } static double find_best_frame_unsharp_amount_neg( - AV1_COMP *const cpi, VmafContext *vmaf_context, - YV12_BUFFER_CONFIG *const src, YV12_BUFFER_CONFIG *const recon, - YV12_BUFFER_CONFIG *const ref, double base_score, - const double unsharp_amount_start, const double step_size, - const int max_loop_count, const double max_filter_amount) { + AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const src, + YV12_BUFFER_CONFIG *const recon, YV12_BUFFER_CONFIG *const ref, + double base_score, const double unsharp_amount_start, + const double step_size, const int max_loop_count, + const double max_filter_amount) { FULLPEL_MV *mvs = NULL; const double src_variance = residual_frame_average_variance(cpi, src, ref, mvs); @@ -1156,22 +1018,28 @@ static double find_best_frame_unsharp_amount_neg( const int width = recon->y_width; const int height = recon->y_height; const int bit_depth = cpi->td.mb.e_mbd.bd; + const int ss_x = recon->subsampling_x; + const int 
ss_y = recon->subsampling_y; + YV12_BUFFER_CONFIG src_blurred, recon_blurred, src_sharpened, recon_sharpened; memset(&recon_sharpened, 0, sizeof(recon_sharpened)); memset(&src_sharpened, 0, sizeof(src_sharpened)); memset(&recon_blurred, 0, sizeof(recon_blurred)); memset(&src_blurred, 0, sizeof(src_blurred)); + aom_alloc_frame_buffer(&recon_sharpened, width, height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); + aom_alloc_frame_buffer(&src_sharpened, width, height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); + aom_alloc_frame_buffer(&recon_blurred, width, height, ss_x, ss_y, + cm->seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); aom_alloc_frame_buffer( - &recon_sharpened, width, height, 1, 1, cm->seq_params.use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment); - aom_alloc_frame_buffer( - &src_sharpened, width, height, 1, 1, cm->seq_params.use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment); - aom_alloc_frame_buffer( - &recon_blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment); - aom_alloc_frame_buffer( - &src_blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth, + &src_blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment); gaussian_blur(bit_depth, recon, &recon_blurred); @@ -1181,32 +1049,28 @@ static double find_best_frame_unsharp_amount_neg( unsharp(cpi, src, &src_blurred, &src_sharpened, unsharp_amount_start); const double variance_start = residual_frame_average_variance(cpi, &src_sharpened, ref, mvs); - const double score_start = - cal_approx_score(cpi, vmaf_context, 1, src_variance, variance_start, - base_score, src, &recon_sharpened); + const double score_start = cal_approx_score( + cpi, 
src_variance, variance_start, base_score, src, &recon_sharpened); const double unsharp_amount_next = unsharp_amount_start + step_size; unsharp(cpi, recon, &recon_blurred, &recon_sharpened, unsharp_amount_next); unsharp(cpi, src, &src_blurred, &src_sharpened, unsharp_amount_next); const double variance_next = residual_frame_average_variance(cpi, &src_sharpened, ref, mvs); - const double score_next = - cal_approx_score(cpi, vmaf_context, 2, src_variance, variance_next, - base_score, src, &recon_sharpened); + const double score_next = cal_approx_score(cpi, src_variance, variance_next, + base_score, src, &recon_sharpened); double unsharp_amount; if (score_next > score_start) { unsharp_amount = find_best_frame_unsharp_amount_loop_neg( - cpi, vmaf_context, src_variance, base_score, src, recon, ref, - &src_blurred, &recon_blurred, &src_sharpened, &recon_sharpened, mvs, - score_next, unsharp_amount_next, step_size, max_loop_count, - max_filter_amount); + cpi, src_variance, base_score, src, recon, ref, &src_blurred, + &recon_blurred, &src_sharpened, &recon_sharpened, mvs, score_next, + unsharp_amount_next, step_size, max_loop_count, max_filter_amount); } else { unsharp_amount = find_best_frame_unsharp_amount_loop_neg( - cpi, vmaf_context, src_variance, base_score, src, recon, ref, - &src_blurred, &recon_blurred, &src_sharpened, &recon_sharpened, mvs, - score_start, unsharp_amount_start, -step_size, max_loop_count, - max_filter_amount); + cpi, src_variance, base_score, src, recon, ref, &src_blurred, + &recon_blurred, &src_sharpened, &recon_sharpened, mvs, score_start, + unsharp_amount_start, -step_size, max_loop_count, max_filter_amount); } aom_free_frame_buffer(&recon_sharpened); @@ -1216,29 +1080,21 @@ static double find_best_frame_unsharp_amount_neg( aom_free(mvs); return unsharp_amount; } -#endif // CONFIG_USE_VMAF_RC void av1_update_vmaf_curve(AV1_COMP *cpi) { YV12_BUFFER_CONFIG *source = cpi->source; YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf; const int 
bit_depth = cpi->td.mb.e_mbd.bd; - const GF_GROUP *const gf_group = &cpi->gf_group; + const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const int layer_depth = - AOMMIN(gf_group->layer_depth[gf_group->index], MAX_ARF_LAYERS - 1); -#if CONFIG_USE_VMAF_RC + AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1); double base_score; - VmafContext *vmaf_context; - aom_init_vmaf_context_rc( - &vmaf_context, cpi->vmaf_info.vmaf_model, - cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN); - aom_calc_vmaf_at_index_rc(vmaf_context, cpi->vmaf_info.vmaf_model, source, - recon, bit_depth, 0, &base_score); + const bool cal_vmaf_neg = + cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN; + aom_calc_vmaf(cpi->vmaf_info.vmaf_model, source, recon, bit_depth, + cal_vmaf_neg, &base_score); cpi->vmaf_info.last_frame_vmaf[layer_depth] = base_score; -#else - aom_calc_vmaf(cpi->oxcf.tune_cfg.vmaf_model_path, source, recon, bit_depth, - &cpi->vmaf_info.last_frame_vmaf[layer_depth]); -#endif // CONFIG_USE_VMAF_RC - if (cpi->common.seq_params.use_highbitdepth) { + if (cpi->common.seq_params->use_highbitdepth) { assert(source->flags & YV12_FLAG_HIGHBITDEPTH); assert(recon->flags & YV12_FLAG_HIGHBITDEPTH); cpi->vmaf_info.last_frame_ysse[layer_depth] = @@ -1248,7 +1104,6 @@ void av1_update_vmaf_curve(AV1_COMP *cpi) { (double)aom_get_y_sse(source, recon); } -#if CONFIG_USE_VMAF_RC if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) { YV12_BUFFER_CONFIG *last, *next; get_neighbor_frames(cpi, &last, &next); @@ -1256,10 +1111,8 @@ void av1_update_vmaf_curve(AV1_COMP *cpi) { get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth); const int max_loop_count = 5; cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] = - find_best_frame_unsharp_amount_neg( - cpi, vmaf_context, source, recon, last, base_score, - best_unsharp_amount_start, 0.025, max_loop_count, 1.01); + find_best_frame_unsharp_amount_neg(cpi, source, recon, last, base_score, + 
best_unsharp_amount_start, 0.025, + max_loop_count, 1.01); } - aom_close_vmaf_context_rc(vmaf_context); -#endif // CONFIG_USE_VMAF_RC } diff --git a/third_party/libaom/source/libaom/av1/encoder/tune_vmaf.h b/third_party/libaom/source/libaom/av1/encoder/tune_vmaf.h index 01c3068bf0..4625fb9061 100644 --- a/third_party/libaom/source/libaom/av1/encoder/tune_vmaf.h +++ b/third_party/libaom/source/libaom/av1/encoder/tune_vmaf.h @@ -36,10 +36,8 @@ typedef struct { // Stores the origial qindex before scaling. int original_qindex; -#if CONFIG_USE_VMAF_RC // VMAF model used in VMAF caculations. VmafModel *vmaf_model; -#endif } TuneVMAFInfo; typedef struct AV1_COMP AV1_COMP; @@ -48,9 +46,7 @@ void av1_vmaf_blk_preprocessing(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source); void av1_vmaf_frame_preprocessing(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source); -#ifdef CONFIG_USE_VMAF_RC void av1_vmaf_neg_preprocessing(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source); -#endif void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi); diff --git a/third_party/libaom/source/libaom/av1/encoder/tx_search.c b/third_party/libaom/source/libaom/av1/encoder/tx_search.c index 30aac0a349..e65b70f788 100644 --- a/third_party/libaom/source/libaom/av1/encoder/tx_search.c +++ b/third_party/libaom/source/libaom/av1/encoder/tx_search.c @@ -618,7 +618,7 @@ static AOM_INLINE void get_energy_distribution_fine( assert(bw <= 32); assert(bh <= 32); assert(((bw - 1) >> w_shift) + (((bh - 1) >> h_shift) << 2) == 15); - if (cpi->common.seq_params.use_highbitdepth) { + if (cpi->common.seq_params->use_highbitdepth) { const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); for (int i = 0; i < bh; ++i) @@ -643,43 +643,43 @@ static AOM_INLINE void get_energy_distribution_fine( const BLOCK_SIZE subsize = (BLOCK_SIZE)f_index; assert(block_size_wide[bsize] == 4 * block_size_wide[subsize]); assert(block_size_high[bsize] == 4 * block_size_high[subsize]); - cpi->fn_ptr[subsize].vf(src, src_stride, 
dst, dst_stride, &esq[0]); - cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, - &esq[1]); - cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, - &esq[2]); - cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, - dst_stride, &esq[3]); + cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[0]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, + dst_stride, &esq[1]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, + dst_stride, &esq[2]); + cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[3]); src += bh / 4 * src_stride; dst += bh / 4 * dst_stride; - cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]); - cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, - &esq[5]); - cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, - &esq[6]); - cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, - dst_stride, &esq[7]); + cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, + dst_stride, &esq[5]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, + dst_stride, &esq[6]); + cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[7]); src += bh / 4 * src_stride; dst += bh / 4 * dst_stride; - cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]); - cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, - &esq[9]); - cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, - &esq[10]); - cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, - dst_stride, &esq[11]); + cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, + 
dst_stride, &esq[9]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, + dst_stride, &esq[10]); + cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[11]); src += bh / 4 * src_stride; dst += bh / 4 * dst_stride; - cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]); - cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, - &esq[13]); - cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, - &esq[14]); - cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, - dst_stride, &esq[15]); + cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, + dst_stride, &esq[13]); + cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, + dst_stride, &esq[14]); + cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[15]); } double total = (double)esq[0] + esq[1] + esq[2] + esq[3] + esq[4] + esq[5] + @@ -769,13 +769,13 @@ static AOM_INLINE void get_2x2_normalized_sses_and_sads( if (sse_norm_arr) { unsigned int this_sse; - cpi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst, - dst_stride, &this_sse); + cpi->ppi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst, + dst_stride, &this_sse); sse_norm_arr[row * 2 + col] = (double)this_sse / num_samples_half; } if (sad_norm_arr) { - const unsigned int this_sad = cpi->fn_ptr[tx_bsize_half].sdf( + const unsigned int this_sad = cpi->ppi->fn_ptr[tx_bsize_half].sdf( this_src, src_stride, this_dst, dst_stride); sad_norm_arr[row * 2 + col] = (double)this_sad / num_samples_half; } @@ -832,11 +832,11 @@ static AOM_INLINE void PrintTransformUnitStats( const uint8_t *const dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; unsigned int sse; - cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); + 
cpi->ppi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); const double sse_norm = (double)sse / num_samples; const unsigned int sad = - cpi->fn_ptr[tx_bsize].sdf(src, src_stride, dst, dst_stride); + cpi->ppi->fn_ptr[tx_bsize].sdf(src, src_stride, dst, dst_stride); const double sad_norm = (double)sad / num_samples; fprintf(fout, " %g %g", sse_norm, sad_norm); @@ -905,8 +905,8 @@ static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) { if (x->skip_chroma_rd && plane) continue; - cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, - &sse); + cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, &sse); total_sse += sse; } total_sse <<= 4; @@ -1030,7 +1030,7 @@ static AOM_INLINE void PrintPredictionUnitStats(const AV1_COMP *const cpi, const double sse_norm = (double)sse / num_samples; const unsigned int sad = - cpi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride); + cpi->ppi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride); const double sad_norm = (double)sad / (1 << num_pels_log2_lookup[plane_bsize]); @@ -1183,7 +1183,7 @@ static unsigned pixel_dist_visible_only( unsigned sse; if (txb_rows == visible_rows && txb_cols == visible_cols) { - cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); + cpi->ppi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); return sse; } @@ -2024,9 +2024,15 @@ get_tx_mask(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, assert(plane == 0); allowed_tx_mask = ext_tx_used_flag; int num_allowed = 0; - const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); - const int *tx_type_probs = - cpi->frame_probs.tx_type_probs[update_type][tx_size]; + const FRAME_UPDATE_TYPE update_type = + get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index); + int *tx_type_probs; +#if CONFIG_FRAME_PARALLEL_ENCODE + tx_type_probs = + (int *)cpi->ppi->temp_frame_probs.tx_type_probs[update_type][tx_size]; +#else + 
tx_type_probs = (int *)cpi->frame_probs.tx_type_probs[update_type][tx_size]; +#endif int i; if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { @@ -2097,25 +2103,8 @@ get_tx_mask(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, #if CONFIG_RD_DEBUG static INLINE void update_txb_coeff_cost(RD_STATS *rd_stats, int plane, - TX_SIZE tx_size, int blk_row, - int blk_col, int txb_coeff_cost) { - (void)blk_row; - (void)blk_col; - (void)tx_size; + int txb_coeff_cost) { rd_stats->txb_coeff_cost[plane] += txb_coeff_cost; - - { - const int txb_h = tx_size_high_unit[tx_size]; - const int txb_w = tx_size_wide_unit[tx_size]; - int idx, idy; - for (idy = 0; idy < txb_h; ++idy) - for (idx = 0; idx < txb_w; ++idx) - rd_stats->txb_coeff_cost_map[plane][blk_row + idy][blk_col + idx] = 0; - - rd_stats->txb_coeff_cost_map[plane][blk_row][blk_col] = txb_coeff_cost; - } - assert(blk_row < TXB_COEFF_COST_MAP_SIZE); - assert(blk_col < TXB_COEFF_COST_MAP_SIZE); } #endif @@ -2674,8 +2663,7 @@ static AOM_INLINE void try_tx_block_no_split( RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse)); if (pick_skip_txfm) { #if CONFIG_RD_DEBUG - update_txb_coeff_cost(rd_stats, 0, tx_size, blk_row, blk_col, - zero_blk_rate - rd_stats->rate); + update_txb_coeff_cost(rd_stats, 0, zero_blk_rate - rd_stats->rate); #endif // CONFIG_RD_DEBUG rd_stats->rate = zero_blk_rate; rd_stats->dist = rd_stats->sse; @@ -2720,11 +2708,12 @@ static AOM_INLINE void try_tx_block_split( x->mode_costs.txfm_partition_cost[txfm_partition_ctx][1]; for (int r = 0, blk_idx = 0; r < txb_height; r += sub_txb_height) { + const int offsetr = blk_row + r; + if (offsetr >= max_blocks_high) break; for (int c = 0; c < txb_width; c += sub_txb_width, ++blk_idx) { assert(blk_idx < 4); - const int offsetr = blk_row + r; const int offsetc = blk_col + c; - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + if (offsetc >= max_blocks_wide) continue; RD_STATS this_rd_stats; int this_cost_valid = 1; @@ -3173,8 
+3162,7 @@ static AOM_INLINE void block_rd_txfm(int plane, int block, int blk_row, } #if CONFIG_RD_DEBUG - update_txb_coeff_cost(&this_rd_stats, plane, tx_size, blk_row, blk_col, - this_rd_stats.rate); + update_txb_coeff_cost(&this_rd_stats, plane, this_rd_stats.rate); #endif // CONFIG_RD_DEBUG av1_set_txb_context(x, plane, block, tx_size, a, l); @@ -3452,15 +3440,18 @@ static AOM_INLINE void tx_block_yrd( const int txb_width = tx_size_wide_unit[sub_txs]; const int txb_height = tx_size_high_unit[sub_txs]; const int step = txb_height * txb_width; + const int row_end = + AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row); + const int col_end = + AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col); RD_STATS pn_rd_stats; int64_t this_rd = 0; assert(txb_width > 0 && txb_height > 0); - for (int row = 0; row < tx_size_high_unit[tx_size]; row += txb_height) { - for (int col = 0; col < tx_size_wide_unit[tx_size]; col += txb_width) { - const int offsetr = blk_row + row; + for (int row = 0; row < row_end; row += txb_height) { + const int offsetr = blk_row + row; + for (int col = 0; col < col_end; col += txb_width) { const int offsetc = blk_col + col; - if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; av1_init_rd_stats(&pn_rd_stats); tx_block_yrd(cpi, x, offsetr, offsetc, block, sub_txs, plane_bsize, diff --git a/third_party/libaom/source/libaom/av1/encoder/txb_rdopt.c b/third_party/libaom/source/libaom/av1/encoder/txb_rdopt.c index 31b86abe64..884d0a9e8b 100644 --- a/third_party/libaom/source/libaom/av1/encoder/txb_rdopt.c +++ b/third_party/libaom/source/libaom/av1/encoder/txb_rdopt.c @@ -327,16 +327,8 @@ int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, const LV_MAP_EOB_COST *txb_eob_costs = &coeff_costs->eob_costs[eob_multi_size][plane_type]; - const int rshift = - (sharpness + - (cpi->oxcf.q_cfg.aq_mode == VARIANCE_AQ && mbmi->segment_id < 4 - ? 
7 - mbmi->segment_id - : 2) + - (cpi->oxcf.q_cfg.aq_mode != VARIANCE_AQ && - cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL && - cm->delta_q_info.delta_q_present_flag && x->sb_energy_level < 0 - ? (3 - x->sb_energy_level) - : 0)); + const int rshift = sharpness + 2; + const int64_t rdmult = (((int64_t)x->rdmult * (plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8)))) + diff --git a/third_party/libaom/source/libaom/av1/encoder/txb_rdopt.h b/third_party/libaom/source/libaom/av1/encoder/txb_rdopt.h index e86caaa06e..70b322a2e1 100644 --- a/third_party/libaom/source/libaom/av1/encoder/txb_rdopt.h +++ b/third_party/libaom/source/libaom/av1/encoder/txb_rdopt.h @@ -44,11 +44,11 @@ extern "C" { * skip flag (tx_skip) and the sign of DC coefficient (dc_sign). * \param[out] rate_cost The entropy cost of coding the transform block * after adjustment of coefficients. - * \param[in] sharpness When sharpness == 1, the function will be less - * aggressive toward lowering the magnitude of coefficients. + * \param[in] sharpness When sharpness > 0, the function will be less + * aggressive towards lowering the magnitude of coefficients. * In this way, the transform block will contain more high-frequency - coefficients - * and therefore preserve the sharpness of the reconstructed block. + * coefficients and therefore will preserve the sharpness of the reconstructed + * block. 
*/ int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, TX_TYPE tx_type, diff --git a/third_party/libaom/source/libaom/av1/encoder/var_based_part.c b/third_party/libaom/source/libaom/av1/encoder/var_based_part.c index a42be4553f..8907d0d0ba 100644 --- a/third_party/libaom/source/libaom/av1/encoder/var_based_part.c +++ b/third_party/libaom/source/libaom/av1/encoder/var_based_part.c @@ -341,7 +341,7 @@ static int64_t scale_part_thresh_content(int64_t threshold_base, int speed, static AOM_INLINE void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[], int q, int content_lowsumdiff, - int segment_id) { + int source_sad, int segment_id) { AV1_COMMON *const cm = &cpi->common; const int is_key_frame = frame_is_intra_only(cm); const int threshold_multiplier = is_key_frame ? 120 : 1; @@ -394,7 +394,6 @@ static AOM_INLINE void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[], scale_part_thresh_content(threshold_base, cpi->oxcf.speed, cm->width, cm->height, cpi->svc.non_reference_frame); #endif - thresholds[0] = threshold_base >> 1; thresholds[1] = threshold_base; thresholds[3] = threshold_base << cpi->oxcf.speed; @@ -436,20 +435,45 @@ static AOM_INLINE void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[], thresholds[2] = (5 * threshold_base) >> 1; } if (cpi->sf.rt_sf.force_large_partition_blocks) { + double weight; + const int win = 20; + if (current_qindex < QINDEX_LARGE_BLOCK_THR - win) + weight = 1.0; + else if (current_qindex > QINDEX_LARGE_BLOCK_THR + win) + weight = 0.0; + else + weight = + 1.0 - (current_qindex - QINDEX_LARGE_BLOCK_THR + win) / (2 * win); + if (cm->width * cm->height > 640 * 480) { + for (int i = 0; i < 4; i++) { + thresholds[i] <<= 1; + } + } if (cm->width * cm->height <= 352 * 288) { thresholds[1] <<= 2; thresholds[2] <<= 5; thresholds[3] = INT32_MAX; - } else if (cm->width * cm->height > 640 * 480 && segment_id == 0) { + // Condition the increase of partition thresholds on the 
segment + // and the content. Avoid the increase for superblocks which have + // high source sad, unless the whole frame has very high motion + // (i.e, cpi->rc.avg_source_sad is very large, in which case all blocks + // have high source sad). + } else if (cm->width * cm->height > 640 * 480 && segment_id == 0 && + (source_sad != kHighSad || cpi->rc.avg_source_sad > 50000)) { thresholds[0] = (3 * thresholds[0]) >> 1; thresholds[3] = INT32_MAX; - if (current_qindex >= QINDEX_LARGE_BLOCK_THR) { - thresholds[1] <<= 1; - thresholds[2] <<= 1; + if (current_qindex > QINDEX_LARGE_BLOCK_THR) { + thresholds[1] = (int)((1 - weight) * (thresholds[1] << 1) + + weight * thresholds[1]); + thresholds[2] = (int)((1 - weight) * (thresholds[2] << 1) + + weight * thresholds[2]); } - } else if (current_qindex > QINDEX_LARGE_BLOCK_THR && segment_id == 0) { - thresholds[1] <<= 2; - thresholds[2] <<= 5; + } else if (current_qindex > QINDEX_LARGE_BLOCK_THR && segment_id == 0 && + (source_sad != kHighSad || cpi->rc.avg_source_sad > 50000)) { + thresholds[1] = + (int)((1 - weight) * (thresholds[1] << 2) + weight * thresholds[1]); + thresholds[2] = + (int)((1 - weight) * (thresholds[2] << 4) + weight * thresholds[2]); thresholds[3] = INT32_MAX; } } @@ -605,7 +629,7 @@ static AOM_INLINE void set_low_temp_var_flag( xd->mi[0]->mv[0].as_mv.col > -mv_thr && xd->mi[0]->mv[0].as_mv.row < mv_thr && xd->mi[0]->mv[0].as_mv.row > -mv_thr))) { - const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64); + const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64); if (is_small_sb) set_low_temp_var_flag_64x64(&cm->mi_params, part_info, xd, &(vt->split[0]), thresholds, mi_col, mi_row); @@ -621,7 +645,8 @@ void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q, if (sf->part_sf.partition_search_type != VAR_BASED_PARTITION) { return; } else { - set_vbp_thresholds(cpi, cpi->vbp_info.thresholds, q, content_lowsumdiff, 0); + set_vbp_thresholds(cpi, cpi->vbp_info.thresholds, q, 
content_lowsumdiff, 0, + 0); // The threshold below is not changed locally. cpi->vbp_info.threshold_minmax = 15 + (q >> 3); } @@ -643,10 +668,17 @@ static AOM_INLINE void chroma_check(AV1_COMP *cpi, MACROBLOCK *x, get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); if (bs != BLOCK_INVALID) - uv_sad = cpi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, pd->dst.buf, - pd->dst.stride); - - x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2); + uv_sad = cpi->ppi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride); + + if (uv_sad > (y_sad >> 1)) + x->color_sensitivity_sb[i - 1] = 1; + else if (uv_sad < (y_sad >> 3)) + x->color_sensitivity_sb[i - 1] = 0; + // Borderline case: to be refined at coding block level in nonrd_pickmode, + // for coding block size < sb_size. + else + x->color_sensitivity_sb[i - 1] = 2; } } @@ -658,7 +690,7 @@ static void fill_variance_tree_leaves( AV1_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; const int is_key_frame = frame_is_intra_only(cm); - const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64); + const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64); const int num_64x64_blocks = is_small_sb ? 1 : 4; // TODO(kyslov) Bring back compute_minmax_variance with content type detection const int compute_minmax_variance = 0; @@ -772,7 +804,7 @@ static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad, AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; const int num_planes = av1_num_planes(cm); - const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64); + const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64); BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128; // TODO(kyslov): we are assuming that the ref is LAST_FRAME! Check if it // is!! @@ -783,13 +815,13 @@ static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad, // For non-SVC GOLDEN is another temporal reference. 
Check if it should be // used as reference for partitioning. - if (!cpi->use_svc && (cpi->ref_frame_flags & AOM_GOLD_FLAG) && + if (!cpi->ppi->use_svc && (cpi->ref_frame_flags & AOM_GOLD_FLAG) && cpi->sf.rt_sf.use_nonrd_pick_mode) { yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); if (yv12_g && yv12_g != yv12) { av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, get_ref_scale_factors(cm, GOLDEN_FRAME), num_planes); - *y_sad_g = cpi->fn_ptr[bsize].sdf( + *y_sad_g = cpi->ppi->fn_ptr[bsize].sdf( x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride); } @@ -799,20 +831,20 @@ static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad, get_ref_scale_factors(cm, LAST_FRAME), num_planes); mi->ref_frame[0] = LAST_FRAME; mi->ref_frame[1] = NONE_FRAME; - mi->bsize = cm->seq_params.sb_size; + mi->bsize = cm->seq_params->sb_size; mi->mv[0].as_int = 0; mi->interp_filters = av1_broadcast_interp_filter(BILINEAR); if (cpi->sf.rt_sf.estimate_motion_for_var_based_partition) { if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) { const MV dummy_mv = { 0, 0 }; - *y_sad = av1_int_pro_motion_estimation(cpi, x, cm->seq_params.sb_size, + *y_sad = av1_int_pro_motion_estimation(cpi, x, cm->seq_params->sb_size, mi_row, mi_col, &dummy_mv); } } if (*y_sad == UINT_MAX) { - *y_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].pre[0].buf, - xd->plane[0].pre[0].stride); + *y_sad = cpi->ppi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, + xd->plane[0].pre[0].stride); } // Pick the ref frame for partitioning, use golden frame only if its @@ -834,7 +866,7 @@ static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad, set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, - cm->seq_params.sb_size, AOM_PLANE_Y, + cm->seq_params->sb_size, AOM_PLANE_Y, AOM_PLANE_Y); } @@ 
-869,12 +901,12 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile, int is_key_frame = (frame_is_intra_only(cm) || - (cpi->use_svc && + (cpi->ppi->use_svc && cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)); - assert(cm->seq_params.sb_size == BLOCK_64X64 || - cm->seq_params.sb_size == BLOCK_128X128); - const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64); + assert(cm->seq_params->sb_size == BLOCK_64X64 || + cm->seq_params->sb_size == BLOCK_128X128); + const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64); const int num_64x64_blocks = is_small_sb ? 1 : 4; unsigned int y_sad = UINT_MAX; @@ -900,10 +932,12 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile, cyclic_refresh_segment_id_boosted(segment_id) && cpi->sf.rt_sf.use_nonrd_pick_mode) { int q = av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex); - set_vbp_thresholds(cpi, thresholds, q, x->content_state_sb.low_sumdiff, 1); + set_vbp_thresholds(cpi, thresholds, q, x->content_state_sb.low_sumdiff, + x->content_state_sb.source_sad, 1); } else { set_vbp_thresholds(cpi, thresholds, cm->quant_params.base_qindex, - x->content_state_sb.low_sumdiff, 0); + x->content_state_sb.low_sumdiff, + x->content_state_sb.source_sad, 0); } // For non keyframes, disable 4x4 average for low resolution when speed = 8 @@ -1025,7 +1059,7 @@ int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile, if (!is_key_frame && (max_var_32x32[m] - min_var_32x32[m]) > 3 * (thresholds[1] >> 3) && max_var_32x32[m] > thresholds[1] >> 1 && - (noise_level >= kMedium || cpi->use_svc || + (noise_level >= kMedium || cpi->ppi->use_svc || cpi->sf.rt_sf.force_large_partition_blocks || !cpi->sf.rt_sf.use_nonrd_pick_mode)) { force_split[1 + m] = 1; diff --git a/third_party/libaom/source/libaom/av1/encoder/x86/highbd_temporal_filter_avx2.c b/third_party/libaom/source/libaom/av1/encoder/x86/highbd_temporal_filter_avx2.c index 
b5477ec9ba..68509fa106 100644 --- a/third_party/libaom/source/libaom/av1/encoder/x86/highbd_temporal_filter_avx2.c +++ b/third_party/libaom/source/libaom/av1/encoder/x86/highbd_temporal_filter_avx2.c @@ -352,10 +352,16 @@ void av1_highbd_apply_temporal_filter_avx2( TF_SEARCH_ERROR_NORM_WEIGHT); const double weight_factor = (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; - // Decay factors for non-local mean approach. - // Smaller q -> smaller filtering weight. + // Adjust filtering based on q. + // Larger q -> stronger filtering -> larger weight. + // Smaller q -> weaker filtering -> smaller weight. double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); q_decay = CLIP(q_decay, 1e-5, 1); + if (q_factor >= TF_QINDEX_CUTOFF) { + // Max q_factor is 255, therefore the upper bound of q_decay is 8. + // We do not need a clip here. + q_decay = 0.5 * pow((double)q_factor / 64, 2); + } // Smaller strength -> smaller filtering weight. double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); s_decay = CLIP(s_decay, 1e-5, 1); @@ -393,6 +399,7 @@ void av1_highbd_apply_temporal_filter_avx2( const double inv_num_ref_pixels = 1.0 / num_ref_pixels; // Larger noise -> larger filtering weight. const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); + // Decay factors for non-local mean approach. const double decay_factor = 1 / (n_decay * q_decay * s_decay); // Filter U-plane and V-plane using Y-plane. 
This is because motion diff --git a/third_party/libaom/source/libaom/av1/encoder/x86/highbd_temporal_filter_sse2.c b/third_party/libaom/source/libaom/av1/encoder/x86/highbd_temporal_filter_sse2.c index bbb3771543..1bfdaf72e1 100644 --- a/third_party/libaom/source/libaom/av1/encoder/x86/highbd_temporal_filter_sse2.c +++ b/third_party/libaom/source/libaom/av1/encoder/x86/highbd_temporal_filter_sse2.c @@ -227,10 +227,16 @@ void av1_highbd_apply_temporal_filter_sse2( TF_SEARCH_ERROR_NORM_WEIGHT); const double weight_factor = (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; - // Decay factors for non-local mean approach. - // Smaller q -> smaller filtering weight. + // Adjust filtering based on q. + // Larger q -> stronger filtering -> larger weight. + // Smaller q -> weaker filtering -> smaller weight. double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); q_decay = CLIP(q_decay, 1e-5, 1); + if (q_factor >= TF_QINDEX_CUTOFF) { + // Max q_factor is 255, therefore the upper bound of q_decay is 8. + // We do not need a clip here. + q_decay = 0.5 * pow((double)q_factor / 64, 2); + } // Smaller strength -> smaller filtering weight. double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); s_decay = CLIP(s_decay, 1e-5, 1); @@ -268,6 +274,7 @@ void av1_highbd_apply_temporal_filter_sse2( const double inv_num_ref_pixels = 1.0 / num_ref_pixels; // Larger noise -> larger filtering weight. const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); + // Decay factors for non-local mean approach. const double decay_factor = 1 / (n_decay * q_decay * s_decay); // Filter U-plane and V-plane using Y-plane. 
This is because motion diff --git a/third_party/libaom/source/libaom/av1/encoder/x86/temporal_filter_avx2.c b/third_party/libaom/source/libaom/av1/encoder/x86/temporal_filter_avx2.c index 72914e1781..8aa07641aa 100644 --- a/third_party/libaom/source/libaom/av1/encoder/x86/temporal_filter_avx2.c +++ b/third_party/libaom/source/libaom/av1/encoder/x86/temporal_filter_avx2.c @@ -238,10 +238,16 @@ void av1_apply_temporal_filter_avx2( TF_SEARCH_ERROR_NORM_WEIGHT); const double weight_factor = (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; - // Decay factors for non-local mean approach. - // Smaller q -> smaller filtering weight. + // Adjust filtering based on q. + // Larger q -> stronger filtering -> larger weight. + // Smaller q -> weaker filtering -> smaller weight. double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); q_decay = CLIP(q_decay, 1e-5, 1); + if (q_factor >= TF_QINDEX_CUTOFF) { + // Max q_factor is 255, therefore the upper bound of q_decay is 8. + // We do not need a clip here. + q_decay = 0.5 * pow((double)q_factor / 64, 2); + } // Smaller strength -> smaller filtering weight. double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); s_decay = CLIP(s_decay, 1e-5, 1); @@ -277,6 +283,7 @@ void av1_apply_temporal_filter_avx2( const double inv_num_ref_pixels = 1.0 / num_ref_pixels; // Larger noise -> larger filtering weight. const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); + // Decay factors for non-local mean approach. const double decay_factor = 1 / (n_decay * q_decay * s_decay); // Filter U-plane and V-plane using Y-plane. 
This is because motion diff --git a/third_party/libaom/source/libaom/av1/encoder/x86/temporal_filter_sse2.c b/third_party/libaom/source/libaom/av1/encoder/x86/temporal_filter_sse2.c index d70792c644..26c3926dca 100644 --- a/third_party/libaom/source/libaom/av1/encoder/x86/temporal_filter_sse2.c +++ b/third_party/libaom/source/libaom/av1/encoder/x86/temporal_filter_sse2.c @@ -215,10 +215,16 @@ void av1_apply_temporal_filter_sse2( TF_SEARCH_ERROR_NORM_WEIGHT); const double weight_factor = (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor; - // Decay factors for non-local mean approach. - // Smaller q -> smaller filtering weight. + // Adjust filtering based on q. + // Larger q -> stronger filtering -> larger weight. + // Smaller q -> weaker filtering -> smaller weight. double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2); q_decay = CLIP(q_decay, 1e-5, 1); + if (q_factor >= TF_QINDEX_CUTOFF) { + // Max q_factor is 255, therefore the upper bound of q_decay is 8. + // We do not need a clip here. + q_decay = 0.5 * pow((double)q_factor / 64, 2); + } // Smaller strength -> smaller filtering weight. double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2); s_decay = CLIP(s_decay, 1e-5, 1); @@ -254,6 +260,7 @@ void av1_apply_temporal_filter_sse2( const double inv_num_ref_pixels = 1.0 / num_ref_pixels; // Larger noise -> larger filtering weight. const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0); + // Decay factors for non-local mean approach. const double decay_factor = 1 / (n_decay * q_decay * s_decay); // Filter U-plane and V-plane using Y-plane. 
This is because motion diff --git a/third_party/libaom/source/libaom/common/args.c b/third_party/libaom/source/libaom/common/args.c index 64d6e03383..ed622943e3 100644 --- a/third_party/libaom/source/libaom/common/args.c +++ b/third_party/libaom/source/libaom/common/args.c @@ -92,7 +92,6 @@ int parse_cfg(const char *file, cfg_options_t *config) { GET_PARAMS(disable_intrabc); GET_PARAMS(disable_cfl); GET_PARAMS(disable_smooth_intra); - GET_PARAMS(disable_diagonal_intra); GET_PARAMS(disable_filter_intra); GET_PARAMS(disable_dual_filter); GET_PARAMS(disable_intra_angle_delta); diff --git a/third_party/libaom/source/libaom/examples/aom_cx_set_ref.c b/third_party/libaom/source/libaom/examples/aom_cx_set_ref.c index 3aea2cfdd6..da36d9fe13 100644 --- a/third_party/libaom/source/libaom/examples/aom_cx_set_ref.c +++ b/third_party/libaom/source/libaom/examples/aom_cx_set_ref.c @@ -271,7 +271,11 @@ int main(int argc, char **argv) { printf("Using %s\n", aom_codec_iface_name(encoder)); +#if CONFIG_REALTIME_ONLY + res = aom_codec_enc_config_default(encoder, &cfg, 1); +#else res = aom_codec_enc_config_default(encoder, &cfg, 0); +#endif if (res) die_codec(&ecodec, "Failed to get default codec config."); cfg.g_w = info.frame_width; @@ -334,6 +338,12 @@ int main(int argc, char **argv) { die_codec(&ecodec, "Failed to set encoder reference frame"); printf(" <SET_REF>"); +#if CONFIG_REALTIME_ONLY + // Set cpu speed in encoder. + if (aom_codec_control(&ecodec, AOME_SET_CPUUSED, 7)) + die_codec(&ecodec, "Failed to set cpu speed"); +#endif + // If set_reference in decoder is commented out, the enc/dec mismatch // would be seen. 
if (test_decode) { diff --git a/third_party/libaom/source/libaom/examples/set_maps.c b/third_party/libaom/source/libaom/examples/set_maps.c index 69b4bccbe6..5a84faa565 100644 --- a/third_party/libaom/source/libaom/examples/set_maps.c +++ b/third_party/libaom/source/libaom/examples/set_maps.c @@ -129,6 +129,14 @@ int main(int argc, char **argv) { const int fps = 2; // TODO(dkovalev) add command line argument const double bits_per_pixel_per_frame = 0.067; +#if CONFIG_REALTIME_ONLY + const int usage = 1; + const int speed = 7; +#else + const int usage = 0; + const int speed = 2; +#endif + exec_name = argv[0]; if (argc != 6) die("Invalid number of arguments"); @@ -157,7 +165,7 @@ int main(int argc, char **argv) { printf("Using %s\n", aom_codec_iface_name(encoder)); - res = aom_codec_enc_config_default(encoder, &cfg, 0); + res = aom_codec_enc_config_default(encoder, &cfg, usage); if (res) die_codec(&codec, "Failed to get default codec config."); cfg.g_w = info.frame_width; @@ -177,7 +185,7 @@ int main(int argc, char **argv) { if (aom_codec_enc_init(&codec, encoder, &cfg, 0)) die("Failed to initialize encoder"); - if (aom_codec_control(&codec, AOME_SET_CPUUSED, 2)) + if (aom_codec_control(&codec, AOME_SET_CPUUSED, speed)) die_codec(&codec, "Failed to set cpu-used"); // Encode frames. 
diff --git a/third_party/libaom/source/libaom/examples/simple_encoder.c b/third_party/libaom/source/libaom/examples/simple_encoder.c index 682fe9842b..c026706555 100644 --- a/third_party/libaom/source/libaom/examples/simple_encoder.c +++ b/third_party/libaom/source/libaom/examples/simple_encoder.c @@ -163,6 +163,13 @@ int main(int argc, char **argv) { const char *infile_arg = NULL; const char *outfile_arg = NULL; const char *keyframe_interval_arg = NULL; +#if CONFIG_REALTIME_ONLY + const int usage = 1; + const int speed = 7; +#else + const int usage = 0; + const int speed = 2; +#endif exec_name = argv[0]; @@ -204,7 +211,7 @@ int main(int argc, char **argv) { printf("Using %s\n", aom_codec_iface_name(encoder)); - res = aom_codec_enc_config_default(encoder, &cfg, 0); + res = aom_codec_enc_config_default(encoder, &cfg, usage); if (res) die_codec(&codec, "Failed to get default codec config."); cfg.g_w = info.frame_width; @@ -223,7 +230,7 @@ int main(int argc, char **argv) { if (aom_codec_enc_init(&codec, encoder, &cfg, 0)) die("Failed to initialize encoder"); - if (aom_codec_control(&codec, AOME_SET_CPUUSED, 2)) + if (aom_codec_control(&codec, AOME_SET_CPUUSED, speed)) die_codec(&codec, "Failed to set cpu-used"); // Encode frames. 
diff --git a/third_party/libaom/source/libaom/examples/svc_encoder_rtc.c b/third_party/libaom/source/libaom/examples/svc_encoder_rtc.c index 87e3aa95f1..44bed38318 100644 --- a/third_party/libaom/source/libaom/examples/svc_encoder_rtc.c +++ b/third_party/libaom/source/libaom/examples/svc_encoder_rtc.c @@ -24,6 +24,7 @@ #include "common/args.h" #include "common/tools_common.h" #include "common/video_writer.h" +#include "examples/encoder_util.h" #include "aom_ports/aom_timer.h" #define OPTION_BUFFER_SIZE 1024 @@ -286,6 +287,9 @@ static void parse_command_line(int argc, const char **argv_, if (app_input->speed > 9) { warn("Mapping speed %d to speed 9.\n", app_input->speed); } + if (app_input->speed <= 6) { + die("Encoder speed setting should be in [7, 9].\n"); + } } else if (arg_match(&arg, &aqmode_arg, argi)) { app_input->aq_mode = arg_parse_uint(&arg); } else if (arg_match(&arg, &threads_arg, argi)) { @@ -567,7 +571,7 @@ static void set_layer_pattern(int layering_mode, int superframe_cnt, layer_id->spatial_layer_id = spatial_layer_id; int lag_index = 0; int base_count = superframe_cnt >> 2; - // Set the referende map buffer idx for the 7 references: + // Set the reference map buffer idx for the 7 references: // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3), // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6). for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = i; @@ -795,12 +799,10 @@ static void set_layer_pattern(int layering_mode, int superframe_cnt, } else if (layer_id->spatial_layer_id == 1) { // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, // GOLDEN (and all other refs) to slot 3. - // Set LAST2 to slot 4 and Update slot 4. + // No update. 
for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 3; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 1; - ref_frame_config->ref_idx[SVC_LAST2_FRAME] = 4; - ref_frame_config->refresh[4] = 1; } } else if ((superframe_cnt - 2) % 4 == 0) { // Middle temporal enhancement layer. @@ -837,13 +839,11 @@ static void set_layer_pattern(int layering_mode, int superframe_cnt, ref_frame_config->refresh[3] = 1; } else if (layer_id->spatial_layer_id == 1) { // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6, - // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4. + // GOLDEN to slot 3. No update. for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = 0; ref_frame_config->ref_idx[SVC_LAST_FRAME] = 6 - shift; ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3; - ref_frame_config->ref_idx[SVC_LAST2_FRAME] = 4; - ref_frame_config->refresh[4] = 1; } } if (layer_id->spatial_layer_id > 0 && !ksvc_mode) { @@ -998,6 +998,64 @@ static void set_layer_pattern(int layering_mode, int superframe_cnt, } } +#if CONFIG_AV1_DECODER +static void test_decode(aom_codec_ctx_t *encoder, aom_codec_ctx_t *decoder, + const int frames_out, int *mismatch_seen) { + aom_image_t enc_img, dec_img; + + if (*mismatch_seen) return; + + /* Get the internal reference frame */ + AOM_CODEC_CONTROL_TYPECHECKED(encoder, AV1_GET_NEW_FRAME_IMAGE, &enc_img); + AOM_CODEC_CONTROL_TYPECHECKED(decoder, AV1_GET_NEW_FRAME_IMAGE, &dec_img); + +#if CONFIG_AV1_HIGHBITDEPTH + if ((enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) != + (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) { + if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_image_t enc_hbd_img; + aom_img_alloc(&enc_hbd_img, enc_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH, + enc_img.d_w, enc_img.d_h, 16); + aom_img_truncate_16_to_8(&enc_hbd_img, &enc_img); + enc_img = enc_hbd_img; + } + if (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_image_t dec_hbd_img; + aom_img_alloc(&dec_hbd_img, dec_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH, + dec_img.d_w, 
dec_img.d_h, 16); + aom_img_truncate_16_to_8(&dec_hbd_img, &dec_img); + dec_img = dec_hbd_img; + } + } +#endif + + if (!aom_compare_img(&enc_img, &dec_img)) { + int y[4], u[4], v[4]; +#if CONFIG_AV1_HIGHBITDEPTH + if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_find_mismatch_high(&enc_img, &dec_img, y, u, v); + } else { + aom_find_mismatch(&enc_img, &dec_img, y, u, v); + } +#else + aom_find_mismatch(&enc_img, &dec_img, y, u, v); +#endif + decoder->err = 1; + printf( + "Encode/decode mismatch on frame %d at" + " Y[%d, %d] {%d/%d}," + " U[%d, %d] {%d/%d}," + " V[%d, %d] {%d/%d}", + frames_out, y[0], y[1], y[2], y[3], u[0], u[1], u[2], u[3], v[0], v[1], + v[2], v[3]); + *mismatch_seen = frames_out; + } + + aom_img_free(&enc_img); + aom_img_free(&dec_img); +} +#endif // CONFIG_AV1_DECODER + int main(int argc, const char **argv) { AppInput app_input; AvxVideoWriter *outfile[AOM_MAX_LAYERS] = { NULL }; @@ -1017,6 +1075,17 @@ int main(int argc, const char **argv) { aom_svc_params_t svc_params; aom_svc_ref_frame_config_t ref_frame_config; +#if CONFIG_INTERNAL_STATS + FILE *stats_file = fopen("opsnr.stt", "a"); + if (stats_file == NULL) { + die("Cannot open opsnr.stt\n"); + } +#endif +#if CONFIG_AV1_DECODER + int mismatch_seen = 0; + aom_codec_ctx_t decoder; +#endif + struct RateControlMetrics rc; int64_t cx_time = 0; int64_t cx_time_sl[3]; // max number of spatial layers. 
@@ -1039,11 +1108,12 @@ int main(int argc, const char **argv) { app_input.input_ctx.framerate.denominator = 1; app_input.input_ctx.only_i420 = 1; app_input.input_ctx.bit_depth = 0; + app_input.speed = 7; exec_name = argv[0]; // start with default encoder configuration - aom_codec_err_t res = - aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg, 0); + aom_codec_err_t res = aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg, + AOM_USAGE_REALTIME); if (res) { die("Failed to get config: %s\n", aom_codec_err_to_string(res)); } @@ -1071,10 +1141,13 @@ int main(int argc, const char **argv) { unsigned int width = cfg.g_w; unsigned int height = cfg.g_h; - if (ts_number_layers != - mode_to_num_temporal_layers[app_input.layering_mode] || - ss_number_layers != mode_to_num_spatial_layers[app_input.layering_mode]) { - die("Number of layers doesn't match layering mode."); + if (app_input.layering_mode >= 0) { + if (ts_number_layers != + mode_to_num_temporal_layers[app_input.layering_mode] || + ss_number_layers != + mode_to_num_spatial_layers[app_input.layering_mode]) { + die("Number of layers doesn't match layering mode."); + } } // Y4M reader has its own allocation. @@ -1109,20 +1182,16 @@ int main(int argc, const char **argv) { svc_params.framerate_factor[2] = 1; } - framerate = cfg.g_timebase.den / cfg.g_timebase.num; - set_rate_control_metrics(&rc, framerate, ss_number_layers, ts_number_layers); - if (app_input.input_ctx.file_type == FILE_TYPE_Y4M) { - if (app_input.input_ctx.width != cfg.g_w || - app_input.input_ctx.height != cfg.g_h) { - die("Incorrect width or height: %d x %d", cfg.g_w, cfg.g_h); - } - if (app_input.input_ctx.framerate.numerator != cfg.g_timebase.den || - app_input.input_ctx.framerate.denominator != cfg.g_timebase.num) { - die("Incorrect framerate: numerator %d denominator %d", - cfg.g_timebase.num, cfg.g_timebase.den); - } + // Override these settings with the info from Y4M file. 
+ cfg.g_w = app_input.input_ctx.width; + cfg.g_h = app_input.input_ctx.height; + // g_timebase is the reciprocal of frame rate. + cfg.g_timebase.num = app_input.input_ctx.framerate.denominator; + cfg.g_timebase.den = app_input.input_ctx.framerate.numerator; } + framerate = cfg.g_timebase.den / cfg.g_timebase.num; + set_rate_control_metrics(&rc, framerate, ss_number_layers, ts_number_layers); AvxVideoInfo info; info.codec_fourcc = get_fourcc_by_aom_encoder(encoder); @@ -1162,6 +1231,12 @@ int main(int argc, const char **argv) { if (aom_codec_enc_init(&codec, encoder, &cfg, 0)) die("Failed to initialize encoder"); +#if CONFIG_AV1_DECODER + if (aom_codec_dec_init(&decoder, get_aom_decoder_by_index(0), NULL, 0)) { + die("Failed to initialize decoder"); + } +#endif + aom_codec_control(&codec, AOME_SET_CPUUSED, app_input.speed); aom_codec_control(&codec, AV1E_SET_AQ_MODE, app_input.aq_mode ? 3 : 0); aom_codec_control(&codec, AV1E_SET_GF_CBR_BOOST_PCT, 0); @@ -1172,6 +1247,7 @@ int main(int argc, const char **argv) { aom_codec_control(&codec, AV1E_SET_COEFF_COST_UPD_FREQ, 3); aom_codec_control(&codec, AV1E_SET_MODE_COST_UPD_FREQ, 3); aom_codec_control(&codec, AV1E_SET_MV_COST_UPD_FREQ, 3); + aom_codec_control(&codec, AV1E_SET_DV_COST_UPD_FREQ, 3); aom_codec_control(&codec, AV1E_SET_CDF_UPDATE_MODE, 1); aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS, cfg.g_threads ? get_msb(cfg.g_threads) : 0); @@ -1196,8 +1272,8 @@ int main(int argc, const char **argv) { svc_params.scaling_factor_num[1] = 1; svc_params.scaling_factor_den[1] = 2; } - aom_codec_control(&codec, AV1E_SET_SVC_PARAMS, &svc_params); + // TODO(aomedia:3032): Configure KSVC in fixed mode. // This controls the maximum target size of the key frame. // For generating smaller key frames, use a smaller max_intra_size_pct @@ -1220,15 +1296,34 @@ int main(int argc, const char **argv) { const aom_codec_cx_pkt_t *pkt; int layer = 0; - // Set the reference/update flags, layer_id, and reference_map - // buffer index. 
- set_layer_pattern(app_input.layering_mode, frame_cnt, &layer_id, - &ref_frame_config, &use_svc_control, slx, is_key_frame, - (app_input.layering_mode == 10)); - aom_codec_control(&codec, AV1E_SET_SVC_LAYER_ID, &layer_id); - if (use_svc_control) - aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_CONFIG, - &ref_frame_config); + // For flexible mode: + if (app_input.layering_mode >= 0) { + // Set the reference/update flags, layer_id, and reference_map + // buffer index. + set_layer_pattern(app_input.layering_mode, frame_cnt, &layer_id, + &ref_frame_config, &use_svc_control, slx, + is_key_frame, (app_input.layering_mode == 10)); + aom_codec_control(&codec, AV1E_SET_SVC_LAYER_ID, &layer_id); + if (use_svc_control) + aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_CONFIG, + &ref_frame_config); + } else { + // Only up to 3 temporal layers supported in fixed mode. + // Only need to set spatial and temporal layer_id: reference + // prediction, refresh, and buffer_idx are set internally. + layer_id.spatial_layer_id = slx; + layer_id.temporal_layer_id = 0; + if (ts_number_layers == 2) { + layer_id.temporal_layer_id = (frame_cnt % 2) != 0; + } else if (ts_number_layers == 3) { + if (frame_cnt % 2 != 0) + layer_id.temporal_layer_id = 2; + else if ((frame_cnt > 1) && ((frame_cnt - 2) % 4 == 0)) + layer_id.temporal_layer_id = 1; + } + aom_codec_control(&codec, AV1E_SET_SVC_LAYER_ID, &layer_id); + } + if (set_err_resil_frame) { // Set error_resilient per frame: off/0 for base layer and // on/1 for enhancement layer frames. @@ -1332,14 +1427,31 @@ int main(int argc, const char **argv) { sum_bitrate2 = 0.0; } } + +#if CONFIG_AV1_DECODER + if (aom_codec_decode(&decoder, pkt->data.frame.buf, + (unsigned int)pkt->data.frame.sz, NULL)) + die_codec(&decoder, "Failed to decode frame."); +#endif + break; default: break; } } +#if CONFIG_AV1_DECODER + // Don't look for mismatch on top spatial and top temporal layers as they + // are non reference frames. 
+ if ((ss_number_layers > 1 || ts_number_layers > 1) && + !(layer_id.temporal_layer_id > 0 && + layer_id.temporal_layer_id == (int)ts_number_layers - 1)) { + test_decode(&codec, &decoder, frame_cnt, &mismatch_seen); + } +#endif } // loop over spatial layers ++frame_cnt; pts += frame_duration; } + close_input_file(&(app_input.input_ctx)); printout_rate_control_summary(&rc, frame_cnt, ss_number_layers, ts_number_layers); @@ -1358,6 +1470,15 @@ int main(int argc, const char **argv) { if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); +#if CONFIG_INTERNAL_STATS + if (mismatch_seen) { + fprintf(stats_file, "First mismatch occurred in frame %d\n", mismatch_seen); + } else { + fprintf(stats_file, "No mismatch detected in recon buffers\n"); + } + fclose(stats_file); +#endif + // Try to rewrite the output file headers with the actual frame count. for (i = 0; i < ss_number_layers * ts_number_layers; ++i) aom_video_writer_close(outfile[i]); diff --git a/third_party/libaom/source/libaom/test/active_map_test.cc b/third_party/libaom/source/libaom/test/active_map_test.cc index 4e30f55f81..2bbc3b64fb 100644 --- a/third_party/libaom/source/libaom/test/active_map_test.cc +++ b/third_party/libaom/source/libaom/test/active_map_test.cc @@ -38,6 +38,9 @@ class ActiveMapTest ::libaom_test::Encoder *encoder) { if (video->frame() == 0) { encoder->Control(AOME_SET_CPUUSED, cpu_used_); + encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0); + encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0); + encoder->Control(AV1E_SET_ENABLE_OBMC, 0); } else if (video->frame() == 3) { aom_active_map_t map = aom_active_map_t(); /* clang-format off */ @@ -87,14 +90,6 @@ class ActiveMapTest TEST_P(ActiveMapTest, Test) { DoTest(); } -class ActiveMapTestLarge : public ActiveMapTest {}; - -TEST_P(ActiveMapTestLarge, Test) { DoTest(); } - -AV1_INSTANTIATE_TEST_SUITE(ActiveMapTestLarge, - ::testing::Values(::libaom_test::kRealTime), - ::testing::Range(0, 5)); - 
AV1_INSTANTIATE_TEST_SUITE(ActiveMapTest, ::testing::Values(::libaom_test::kRealTime), ::testing::Range(5, 9)); diff --git a/third_party/libaom/source/libaom/test/altref_test.cc b/third_party/libaom/source/libaom/test/altref_test.cc index 1334b4af57..002a206967 100644 --- a/third_party/libaom/source/libaom/test/altref_test.cc +++ b/third_party/libaom/source/libaom/test/altref_test.cc @@ -133,9 +133,7 @@ const gfIntervalParam gfTestParams[] = { { ::libaom_test::kTwoPassGood, 5, 10 }, { ::libaom_test::kTwoPassGood, 8, 16 }, { ::libaom_test::kTwoPassGood, 16, 32 }, - // disabled below test case because it causes failure - // TODO(anyone): enable below test case once issue is fixed. - // { ::libaom_test::kTwoPassGood, 20, 32 }, + { ::libaom_test::kTwoPassGood, 20, 32 }, }; // This class is used to test if the gf interval bounds configured by the user diff --git a/third_party/libaom/source/libaom/test/aom_image_test.cc b/third_party/libaom/source/libaom/test/aom_image_test.cc new file mode 100644 index 0000000000..7ff82d7273 --- /dev/null +++ b/third_party/libaom/source/libaom/test/aom_image_test.cc @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom/aom_image.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +TEST(AomImageTest, AomImgWrapInvalidAlign) { + const int kWidth = 128; + const int kHeight = 128; + unsigned char buf[kWidth * kHeight * 3]; + + aom_image_t img; + // Set img_data and img_data_owner to junk values. 
aom_img_wrap() should + // not read these values on failure. + img.img_data = (unsigned char *)""; + img.img_data_owner = 1; + + aom_img_fmt_t format = AOM_IMG_FMT_I444; + // 'align' must be a power of 2 but is not. This causes the aom_img_wrap() + // call to fail. The test verifies we do not read the junk values in 'img'. + unsigned int align = 31; + EXPECT_EQ(aom_img_wrap(&img, format, kWidth, kHeight, align, buf), nullptr); +} diff --git a/third_party/libaom/source/libaom/test/aq_segment_test.cc b/third_party/libaom/source/libaom/test/aq_segment_test.cc index 4e52b55dbe..b4a8b612bf 100644 --- a/third_party/libaom/source/libaom/test/aq_segment_test.cc +++ b/third_party/libaom/source/libaom/test/aq_segment_test.cc @@ -19,6 +19,13 @@ namespace { +const libaom_test::TestMode kTestModeParams[] = +#if CONFIG_REALTIME_ONLY + { ::libaom_test::kRealTime }; +#else + { ::libaom_test::kRealTime, ::libaom_test::kOnePassGood }; +#endif + class AqSegmentTest : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int, int>, @@ -40,6 +47,11 @@ class AqSegmentTest encoder->Control(AV1E_SET_AQ_MODE, aq_mode_); encoder->Control(AV1E_SET_DELTAQ_MODE, deltaq_mode_); encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT, 100); + if (mode_ == ::libaom_test::kRealTime) { + encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0); + encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0); + encoder->Control(AV1E_SET_ENABLE_OBMC, 0); + } } } @@ -69,10 +81,7 @@ class AqSegmentTest // 3-cyclic_refresh_aq) encodes and decodes without a mismatch. TEST_P(AqSegmentTest, TestNoMisMatch) { DoTest(GET_PARAM(3)); } -class AqSegmentTestLarge : public AqSegmentTest {}; - -TEST_P(AqSegmentTestLarge, TestNoMisMatch) { DoTest(GET_PARAM(3)); } - +#if !CONFIG_REALTIME_ONLY // Validate that this delta q mode // encodes and decodes without a mismatch. 
TEST_P(AqSegmentTest, TestNoMisMatchExtDeltaQ) { @@ -84,13 +93,18 @@ TEST_P(AqSegmentTest, TestNoMisMatchExtDeltaQ) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } +#endif -AV1_INSTANTIATE_TEST_SUITE(AqSegmentTest, - ::testing::Values(::libaom_test::kRealTime, - ::libaom_test::kOnePassGood), +AV1_INSTANTIATE_TEST_SUITE(AqSegmentTest, ::testing::ValuesIn(kTestModeParams), ::testing::Range(5, 9), ::testing::Range(0, 4)); + +#if !CONFIG_REALTIME_ONLY +class AqSegmentTestLarge : public AqSegmentTest {}; + +TEST_P(AqSegmentTestLarge, TestNoMisMatch) { DoTest(GET_PARAM(3)); } + AV1_INSTANTIATE_TEST_SUITE(AqSegmentTestLarge, - ::testing::Values(::libaom_test::kRealTime, - ::libaom_test::kOnePassGood), + ::testing::Values(::libaom_test::kOnePassGood), ::testing::Range(3, 5), ::testing::Range(0, 4)); +#endif } // namespace diff --git a/third_party/libaom/source/libaom/test/arf_freq_test.cc b/third_party/libaom/source/libaom/test/arf_freq_test.cc index 0bf47e6ec4..d12f5ccee6 100644 --- a/third_party/libaom/source/libaom/test/arf_freq_test.cc +++ b/third_party/libaom/source/libaom/test/arf_freq_test.cc @@ -56,9 +56,13 @@ const TestVideoParam kTestVectors[] = { }; const TestEncodeParam kEncodeVectors[] = { - { ::libaom_test::kOnePassGood, 2 }, { ::libaom_test::kOnePassGood, 5 }, - { ::libaom_test::kTwoPassGood, 1 }, { ::libaom_test::kTwoPassGood, 2 }, - { ::libaom_test::kTwoPassGood, 5 }, { ::libaom_test::kRealTime, 5 }, +#if CONFIG_REALTIME_ONLY + { ::libaom_test::kRealTime, 5 }, +#else + { ::libaom_test::kRealTime, 5 }, { ::libaom_test::kOnePassGood, 2 }, + { ::libaom_test::kOnePassGood, 5 }, { ::libaom_test::kTwoPassGood, 1 }, + { ::libaom_test::kTwoPassGood, 2 }, { ::libaom_test::kTwoPassGood, 5 }, +#endif }; const int kMinArfVectors[] = { diff --git a/third_party/libaom/source/libaom/test/av1_convolve_scale_test.cc b/third_party/libaom/source/libaom/test/av1_convolve_scale_test.cc index a1c5746637..65300140ba 100644 --- 
a/third_party/libaom/source/libaom/test/av1_convolve_scale_test.cc +++ b/third_party/libaom/source/libaom/test/av1_convolve_scale_test.cc @@ -293,8 +293,8 @@ class ConvolveScaleTestBase : public ::testing::Test { convolve_params_.do_average = do_average; } else { convolve_params_.use_dist_wtd_comp_avg = use_dist_wtd_comp_avg; - convolve_params_.fwd_offset = quant_dist_lookup_table[i][j][0]; - convolve_params_.bck_offset = quant_dist_lookup_table[i][j][1]; + convolve_params_.fwd_offset = quant_dist_lookup_table[j][i]; + convolve_params_.bck_offset = quant_dist_lookup_table[j][1 - i]; convolve_params_.is_compound = is_compound; convolve_params_.do_average = do_average; } diff --git a/third_party/libaom/source/libaom/test/av1_convolve_test.cc b/third_party/libaom/source/libaom/test/av1_convolve_test.cc index 0c902808ad..4d61f02298 100644 --- a/third_party/libaom/source/libaom/test/av1_convolve_test.cc +++ b/third_party/libaom/source/libaom/test/av1_convolve_test.cc @@ -1172,8 +1172,8 @@ std::vector<CompoundParam> GetCompoundParams() { result.push_back(CompoundParam(false, 0, 0)); for (int k = 0; k < 2; ++k) { for (int l = 0; l < 4; ++l) { - result.push_back(CompoundParam(true, quant_dist_lookup_table[k][l][0], - quant_dist_lookup_table[k][l][1])); + result.push_back(CompoundParam(true, quant_dist_lookup_table[l][k], + quant_dist_lookup_table[l][1 - k])); } } return result; diff --git a/third_party/libaom/source/libaom/test/av1_external_partition_test.cc b/third_party/libaom/source/libaom/test/av1_external_partition_test.cc new file mode 100644 index 0000000000..4fe61c7843 --- /dev/null +++ b/third_party/libaom/source/libaom/test/av1_external_partition_test.cc @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <fstream> +#include <new> +#include <sstream> +#include <string> + +#include "aom/aom_codec.h" +#include "aom/aom_external_partition.h" +#include "av1/common/blockd.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/y4m_video_source.h" +#include "test/util.h" + +#if CONFIG_AV1_ENCODER +#if !CONFIG_REALTIME_ONLY +namespace { + +constexpr int kFrameNum = 8; +constexpr int kVersion = 1; + +typedef struct TestData { + int version = kVersion; +} TestData; + +typedef struct ToyModel { + TestData *data; + aom_ext_part_config_t config; + aom_ext_part_funcs_t funcs; +} ToyModel; + +// Feature files written during encoding, as defined in partition_strategy.c. +std::string feature_file_names[] = { + "feature_before_partition_none", + "feature_before_partition_none_prune_rect", + "feature_after_partition_none_prune", + "feature_after_partition_none_terminate", + "feature_after_partition_split_terminate", + "feature_after_partition_split_prune_rect", + "feature_after_partition_rect", + "feature_after_partition_ab", +}; + +// Files written here in the test, where the feature data is received +// from the API. 
+std::string test_feature_file_names[] = { + "test_feature_before_partition_none", + "test_feature_before_partition_none_prune_rect", + "test_feature_after_partition_none_prune", + "test_feature_after_partition_none_terminate", + "test_feature_after_partition_split_terminate", + "test_feature_after_partition_split_prune_rect", + "test_feature_after_partition_rect", + "test_feature_after_partition_ab", +}; + +static void write_features_to_file(const float *features, + const int feature_size, const int id) { + char filename[256]; + snprintf(filename, sizeof(filename), "%s", + test_feature_file_names[id].c_str()); + FILE *pfile = fopen(filename, "a"); + for (int i = 0; i < feature_size; ++i) { + fprintf(pfile, "%.6f", features[i]); + if (i < feature_size - 1) fprintf(pfile, ","); + } + fprintf(pfile, "\n"); + fclose(pfile); +} + +aom_ext_part_status_t ext_part_create_model( + void *priv, const aom_ext_part_config_t *part_config, + aom_ext_part_model_t *ext_part_model) { + TestData *received_data = reinterpret_cast<TestData *>(priv); + EXPECT_EQ(received_data->version, kVersion); + ToyModel *toy_model = new (std::nothrow) ToyModel; + EXPECT_NE(toy_model, nullptr); + toy_model->data = received_data; + *ext_part_model = toy_model; + EXPECT_EQ(part_config->superblock_size, BLOCK_64X64); + return AOM_EXT_PART_OK; +} + +aom_ext_part_status_t ext_part_create_model_test( + void *priv, const aom_ext_part_config_t *part_config, + aom_ext_part_model_t *ext_part_model) { + (void)priv; + (void)ext_part_model; + EXPECT_EQ(part_config->superblock_size, BLOCK_64X64); + return AOM_EXT_PART_TEST; +} + +aom_ext_part_status_t ext_part_send_features( + aom_ext_part_model_t ext_part_model, + const aom_partition_features_t *part_features) { + (void)ext_part_model; + (void)part_features; + return AOM_EXT_PART_OK; +} + +aom_ext_part_status_t ext_part_send_features_test( + aom_ext_part_model_t ext_part_model, + const aom_partition_features_t *part_features) { + (void)ext_part_model; + if 
(part_features->id == FEATURE_BEFORE_PART_NONE) { + write_features_to_file(part_features->before_part_none.f, SIZE_DIRECT_SPLIT, + 0); + } else if (part_features->id == FEATURE_BEFORE_PART_NONE_PART2) { + write_features_to_file(part_features->before_part_none.f_part2, + SIZE_PRUNE_PART, 1); + } else if (part_features->id == FEATURE_AFTER_PART_NONE) { + write_features_to_file(part_features->after_part_none.f, SIZE_PRUNE_NONE, + 2); + } else if (part_features->id == FEATURE_AFTER_PART_NONE_PART2) { + write_features_to_file(part_features->after_part_none.f_terminate, + SIZE_TERM_NONE, 3); + } else if (part_features->id == FEATURE_AFTER_PART_SPLIT) { + write_features_to_file(part_features->after_part_split.f_terminate, + SIZE_TERM_SPLIT, 4); + } else if (part_features->id == FEATURE_AFTER_PART_SPLIT_PART2) { + write_features_to_file(part_features->after_part_split.f_prune_rect, + SIZE_PRUNE_RECT, 5); + } else if (part_features->id == FEATURE_AFTER_PART_RECT) { + write_features_to_file(part_features->after_part_rect.f, SIZE_PRUNE_AB, 6); + } else if (part_features->id == FEATURE_AFTER_PART_AB) { + write_features_to_file(part_features->after_part_ab.f, SIZE_PRUNE_4_WAY, 7); + } + return AOM_EXT_PART_TEST; +} + +aom_ext_part_status_t ext_part_get_partition_decision( + aom_ext_part_model_t ext_part_model, + aom_partition_decision_t *ext_part_decision) { + (void)ext_part_model; + (void)ext_part_decision; + return AOM_EXT_PART_ERROR; +} + +aom_ext_part_status_t ext_part_send_partition_stats( + aom_ext_part_model_t ext_part_model, + const aom_partition_stats_t *ext_part_stats) { + (void)ext_part_model; + (void)ext_part_stats; + return AOM_EXT_PART_OK; +} + +aom_ext_part_status_t ext_part_delete_model( + aom_ext_part_model_t ext_part_model) { + ToyModel *toy_model = static_cast<ToyModel *>(ext_part_model); + EXPECT_EQ(toy_model->data->version, kVersion); + delete toy_model; + return AOM_EXT_PART_OK; +} + +class ExternalPartitionTest + : public 
::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>, + public ::libaom_test::EncoderTest { + protected: + ExternalPartitionTest() + : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), + cpu_used_(GET_PARAM(2)), psnr_(0.0), nframes_(0) {} + virtual ~ExternalPartitionTest() {} + + virtual void SetUp() { + InitializeConfig(encoding_mode_); + const aom_rational timebase = { 1, 30 }; + cfg_.g_timebase = timebase; + cfg_.rc_end_usage = AOM_VBR; + cfg_.g_threads = 1; + cfg_.g_lag_in_frames = 4; + cfg_.rc_target_bitrate = 400; + init_flags_ = AOM_CODEC_USE_PSNR; + } + + virtual bool DoDecode() const { return false; } + + virtual void BeginPassHook(unsigned int) { + psnr_ = 0.0; + nframes_ = 0; + } + + virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + double GetAveragePsnr() const { + if (nframes_) return psnr_ / nframes_; + return 0.0; + } + + void SetExternalPartition(bool use_external_partition) { + use_external_partition_ = use_external_partition; + } + + void SetTestSendFeatures(int test_send_features) { + test_send_features_ = test_send_features; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + aom_ext_part_funcs_t ext_part_funcs; + ext_part_funcs.priv = reinterpret_cast<void *>(&test_data_); + if (use_external_partition_) { + ext_part_funcs.create_model = ext_part_create_model; + ext_part_funcs.send_features = ext_part_send_features; + } + if (test_send_features_ == 1) { + ext_part_funcs.create_model = ext_part_create_model; + ext_part_funcs.send_features = ext_part_send_features_test; + } else if (test_send_features_ == 0) { + ext_part_funcs.create_model = ext_part_create_model_test; + ext_part_funcs.send_features = ext_part_send_features; + } + ext_part_funcs.get_partition_decision = ext_part_get_partition_decision; + ext_part_funcs.send_partition_stats = ext_part_send_partition_stats; + 
ext_part_funcs.delete_model = ext_part_delete_model; + + encoder->Control(AOME_SET_CPUUSED, cpu_used_); + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1); + if (use_external_partition_) { + encoder->Control(AV1E_SET_EXTERNAL_PARTITION, &ext_part_funcs); + } + } + } + + private: + libaom_test::TestMode encoding_mode_; + int cpu_used_; + double psnr_; + unsigned int nframes_; + bool use_external_partition_ = false; + int test_send_features_ = -1; + TestData test_data_; +}; + +// Encode twice and expect the same psnr value. +// The first run is the baseline without external partition. +// The second run is to get partition decisions from the toy model we defined. +// Here, we let the partition decision return true for all stages. +// In this case, the external partition doesn't alter the original encoder +// behavior. So we expect the same encoding results. +TEST_P(ExternalPartitionTest, EncodeMatch) { + ::libaom_test::Y4mVideoSource video("paris_352_288_30.y4m", 0, kFrameNum); + SetExternalPartition(false); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double psnr = GetAveragePsnr(); + + SetExternalPartition(true); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double psnr2 = GetAveragePsnr(); + + EXPECT_DOUBLE_EQ(psnr, psnr2); +} + +// Encode twice to compare generated feature files. +// The first run let the encoder write partition features to file. +// The second run calls send partition features function to send features to +// the external model, and we write them to file. +// The generated files should match each other. +TEST_P(ExternalPartitionTest, SendFeatures) { + ::libaom_test::Y4mVideoSource video("paris_352_288_30.y4m", 0, kFrameNum); + SetExternalPartition(true); + SetTestSendFeatures(0); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + SetExternalPartition(true); + SetTestSendFeatures(1); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // Compare feature files by reading them into strings. 
+ for (int i = 0; i < 8; ++i) { + std::ifstream base_file(feature_file_names[i]); + std::stringstream base_stream; + base_stream << base_file.rdbuf(); + std::string base_string = base_stream.str(); + + std::ifstream test_file(test_feature_file_names[i]); + std::stringstream test_stream; + test_stream << test_file.rdbuf(); + std::string test_string = test_stream.str(); + + EXPECT_STREQ(base_string.c_str(), test_string.c_str()); + } + + // Remove files. + std::string command("rm -f feature_* test_feature_*"); + system(command.c_str()); +} + +AV1_INSTANTIATE_TEST_SUITE(ExternalPartitionTest, + ::testing::Values(::libaom_test::kTwoPassGood), + ::testing::Values(4)); // cpu_used + +} // namespace +#endif // !CONFIG_REALTIME_ONLY +#endif // CONFIG_AV1_ENCODER diff --git a/third_party/libaom/source/libaom/test/av1_fwd_txfm2d_test.cc b/third_party/libaom/source/libaom/test/av1_fwd_txfm2d_test.cc index 0e7eb09f2a..d124330ff8 100644 --- a/third_party/libaom/source/libaom/test/av1_fwd_txfm2d_test.cc +++ b/third_party/libaom/source/libaom/test/av1_fwd_txfm2d_test.cc @@ -362,6 +362,78 @@ TEST_P(AV1FwdTxfm2dTest, match) { TEST_P(AV1FwdTxfm2dTest, DISABLED_Speed) { AV1FwdTxfm2dSpeedTest(GET_PARAM(0), GET_PARAM(1)); } +TEST(AV1FwdTxfm2dTest, DCTScaleTest) { + BitDepthInfo bd_info; + bd_info.bit_depth = 8; + bd_info.use_highbitdepth_buf = 0; + DECLARE_ALIGNED(32, int16_t, src_diff[1024]); + DECLARE_ALIGNED(32, tran_low_t, coeff[1024]); + + const TX_SIZE tx_size_list[4] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32 }; + const int stride_list[4] = { 4, 8, 16, 32 }; + const int ref_scale_list[4] = { 64, 64, 64, 16 }; + + for (int i = 0; i < 4; i++) { + TX_SIZE tx_size = tx_size_list[i]; + int stride = stride_list[i]; + int array_size = stride * stride; + + for (int i = 0; i < array_size; i++) { + src_diff[i] = 8; + coeff[i] = 0; + } + + av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, stride, + coeff); + + double input_sse = 0; + double output_sse = 0; + for (int i = 0; i < 
array_size; i++) { + input_sse += pow(src_diff[i], 2); + output_sse += pow(coeff[i], 2); + } + + double scale = output_sse / input_sse; + + EXPECT_NEAR(scale, ref_scale_list[i], 5); + } +} +TEST(AV1FwdTxfm2dTest, HadamardScaleTest) { + BitDepthInfo bd_info; + bd_info.bit_depth = 8; + bd_info.use_highbitdepth_buf = 0; + DECLARE_ALIGNED(32, int16_t, src_diff[1024]); + DECLARE_ALIGNED(32, tran_low_t, coeff[1024]); + + const TX_SIZE tx_size_list[4] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32 }; + const int stride_list[4] = { 4, 8, 16, 32 }; + const int ref_scale_list[4] = { 1, 64, 64, 16 }; + + for (int i = 0; i < 4; i++) { + TX_SIZE tx_size = tx_size_list[i]; + int stride = stride_list[i]; + int array_size = stride * stride; + + for (int i = 0; i < array_size; i++) { + src_diff[i] = 8; + coeff[i] = 0; + } + + av1_quick_txfm(/*use_hadamard=*/1, tx_size, bd_info, src_diff, stride, + coeff); + + double input_sse = 0; + double output_sse = 0; + for (int i = 0; i < array_size; i++) { + input_sse += pow(src_diff[i], 2); + output_sse += pow(coeff[i], 2); + } + + double scale = output_sse / input_sse; + + EXPECT_NEAR(scale, ref_scale_list[i], 5); + } +} using ::testing::Combine; using ::testing::Values; using ::testing::ValuesIn; @@ -580,8 +652,10 @@ using ::testing::ValuesIn; #if HAVE_SSE4_1 static TX_SIZE Highbd_fwd_txfm_for_sse4_1[] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_64X64, TX_4X8, TX_8X4, - TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, TX_4X16, - TX_16X4, TX_8X32, TX_32X8, TX_16X64, TX_64X16, + TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, +#if !CONFIG_REALTIME_ONLY + TX_4X16, TX_16X4, TX_8X32, TX_32X8, TX_16X64, TX_64X16, +#endif }; INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdFwdTxfm2dTest, diff --git a/third_party/libaom/source/libaom/test/av1_highbd_iht_test.cc b/third_party/libaom/source/libaom/test/av1_highbd_iht_test.cc index a576c0ffed..165abc9483 100644 --- a/third_party/libaom/source/libaom/test/av1_highbd_iht_test.cc +++ 
b/third_party/libaom/source/libaom/test/av1_highbd_iht_test.cc @@ -210,6 +210,12 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1HighbdInvTxfm2d); void AV1HighbdInvTxfm2d::RunAV1InvTxfm2dTest(TX_TYPE tx_type_, TX_SIZE tx_size_, int run_times, int bit_depth_, int gt_int16) { +#if CONFIG_REALTIME_ONLY + if (tx_size_ == TX_4X16 || tx_size_ == TX_16X4 || tx_size_ == TX_8X32 || + tx_size_ == TX_32X8 || tx_size_ == TX_16X64 || tx_size_ == TX_64X16) { + return; + } +#endif FwdTxfm2dFunc fwd_func_ = libaom_test::fwd_txfm_func_ls[tx_size_]; TxfmParam txfm_param; const int BLK_WIDTH = 64; diff --git a/third_party/libaom/source/libaom/test/av1_key_value_api_test.cc b/third_party/libaom/source/libaom/test/av1_key_value_api_test.cc index 3d06d2d6c5..058b8ce443 100644 --- a/third_party/libaom/source/libaom/test/av1_key_value_api_test.cc +++ b/third_party/libaom/source/libaom/test/av1_key_value_api_test.cc @@ -29,10 +29,15 @@ class BaseKeyValAPI : public testing::Test { #if CONFIG_AV1_ENCODER aom_codec_iface_t *iface_cx = aom_codec_av1_cx(); aom_codec_enc_cfg_t enc_cfg; - +#if CONFIG_REALTIME_ONLY + const int usage = 1; +#else + const int usage = 0; +#endif + EXPECT_EQ(AOM_CODEC_OK, + aom_codec_enc_config_default(iface_cx, &enc_cfg, usage)); EXPECT_EQ(AOM_CODEC_OK, - aom_codec_enc_config_default(iface_cx, &enc_cfg, 0)); - EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc_, iface_cx, &enc_cfg, 0)); + aom_codec_enc_init(&enc_, iface_cx, &enc_cfg, usage)); #endif #if CONFIG_AV1_DECODER aom_codec_iface_t *iface_dx = aom_codec_av1_dx(); diff --git a/third_party/libaom/source/libaom/test/av1_quantize_test.cc b/third_party/libaom/source/libaom/test/av1_quantize_test.cc index f0882c7099..bfb684effd 100644 --- a/third_party/libaom/source/libaom/test/av1_quantize_test.cc +++ b/third_party/libaom/source/libaom/test/av1_quantize_test.cc @@ -19,6 +19,7 @@ #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "av1/common/scan.h" +#include 
"av1/encoder/av1_quantize.h" namespace { @@ -207,6 +208,32 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(AV1QuantizeTest); TEST_P(AV1QuantizeTest, BitExactCheck) { RunQuantizeTest(); } TEST_P(AV1QuantizeTest, EobVerify) { RunEobTest(); } +TEST(AV1QuantizeTest, QuantizeFpNoQmatrix) { + // Here we use a uniform quantizer as an example + const int16_t dequant_ptr[2] = { 78, 93 }; // quantize step + const int16_t round_ptr[2] = { 39, 46 }; // round ~= dequant / 2 + + // quant ~= 2^16 / dequant. This is a 16-bit fixed point representation of the + // inverse of quantize step. + const int16_t quant_ptr[2] = { 840, 704 }; + int log_scale = 0; + int coeff_count = 4; + const tran_low_t coeff_ptr[4] = { -449, 624, -14, 24 }; + const tran_low_t ref_qcoeff_ptr[4] = { -6, 7, 0, 0 }; + const tran_low_t ref_dqcoeff_ptr[4] = { -468, 651, 0, 0 }; + const int16_t scan[4] = { 0, 1, 2, 3 }; + tran_low_t qcoeff_ptr[4]; + tran_low_t dqcoeff_ptr[4]; + int eob = av1_quantize_fp_no_qmatrix(quant_ptr, dequant_ptr, round_ptr, + log_scale, scan, coeff_count, coeff_ptr, + qcoeff_ptr, dqcoeff_ptr); + EXPECT_EQ(eob, 2); + for (int i = 0; i < coeff_count; ++i) { + EXPECT_EQ(qcoeff_ptr[i], ref_qcoeff_ptr[i]); + EXPECT_EQ(dqcoeff_ptr[i], ref_dqcoeff_ptr[i]); + } +} + #if HAVE_SSE4_1 const QuantizeFuncParams qfps[4] = { QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c, diff --git a/third_party/libaom/source/libaom/test/block_test.cc b/third_party/libaom/source/libaom/test/block_test.cc index 9cf5b020ef..74deee3f54 100644 --- a/third_party/libaom/source/libaom/test/block_test.cc +++ b/third_party/libaom/source/libaom/test/block_test.cc @@ -191,9 +191,17 @@ TEST_P(SuperBlockSizeTestLarge, SuperBlockSizeTest) { << "Failed for SB size " << superblock_size_; } +const ::libaom_test::TestMode kTestModes[] = { +#if CONFIG_REALTIME_ONLY + ::libaom_test::kRealTime +#else + ::libaom_test::kRealTime, ::libaom_test::kOnePassGood, + ::libaom_test::kTwoPassGood +#endif +}; + 
AV1_INSTANTIATE_TEST_SUITE(SuperBlockSizeTestLarge, - ::testing::Values(::libaom_test::kOnePassGood, - ::libaom_test::kTwoPassGood), + ::testing::ValuesIn(kTestModes), ::testing::Values(AOM_SUPERBLOCK_SIZE_64X64, AOM_SUPERBLOCK_SIZE_128X128), ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ)); diff --git a/third_party/libaom/source/libaom/test/coding_path_sync.cc b/third_party/libaom/source/libaom/test/coding_path_sync.cc index 4c613dc03b..0eaa9dad8d 100644 --- a/third_party/libaom/source/libaom/test/coding_path_sync.cc +++ b/third_party/libaom/source/libaom/test/coding_path_sync.cc @@ -31,7 +31,11 @@ class CompressedSource { aom_codec_iface_t *algo = aom_codec_av1_cx(); aom_codec_enc_cfg_t cfg; +#if CONFIG_REALTIME_ONLY + aom_codec_enc_config_default(algo, &cfg, 1); +#else aom_codec_enc_config_default(algo, &cfg, 0); +#endif // force the quantizer, to reduce the sensitivity on encoding choices. // e.g, we don't want this test to break when the rate control is modified. diff --git a/third_party/libaom/source/libaom/test/comp_avg_pred_test.h b/third_party/libaom/source/libaom/test/comp_avg_pred_test.h index 7f73312c4e..f2fee6d434 100644 --- a/third_party/libaom/source/libaom/test/comp_avg_pred_test.h +++ b/third_party/libaom/source/libaom/test/comp_avg_pred_test.h @@ -117,8 +117,8 @@ class AV1DISTWTDCOMPAVGTest for (int ii = 0; ii < 2; ii++) { for (int jj = 0; jj < 4; jj++) { - dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; - dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[jj][1 - ii]; const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); @@ -160,8 +160,8 @@ class AV1DISTWTDCOMPAVGTest DIST_WTD_COMP_PARAMS dist_wtd_comp_params; dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; - dist_wtd_comp_params.fwd_offset = 
quant_dist_lookup_table[0][0][0]; - dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1]; + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1]; const int num_loops = 1000000000 / (in_w + in_h); aom_usec_timer timer; @@ -226,10 +226,9 @@ class AV1DISTWTDCOMPAVGUPSAMPLEDTest for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) { for (int ii = 0; ii < 2; ii++) { for (int jj = 0; jj < 4; jj++) { - dist_wtd_comp_params.fwd_offset = - quant_dist_lookup_table[ii][jj][0]; + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii]; dist_wtd_comp_params.bck_offset = - quant_dist_lookup_table[ii][jj][1]; + quant_dist_lookup_table[jj][1 - ii]; const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); @@ -282,8 +281,8 @@ class AV1DISTWTDCOMPAVGUPSAMPLEDTest DIST_WTD_COMP_PARAMS dist_wtd_comp_params; dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; - dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0]; - dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1]; + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1]; int sub_x_q3 = 0; int sub_y_q3 = 0; @@ -351,8 +350,8 @@ class AV1HighBDDISTWTDCOMPAVGTest for (int ii = 0; ii < 2; ii++) { for (int jj = 0; jj < 4; jj++) { - dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; - dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[jj][1 - ii]; const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); @@ -398,8 +397,8 @@ class AV1HighBDDISTWTDCOMPAVGTest DIST_WTD_COMP_PARAMS dist_wtd_comp_params; dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; - 
dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0]; - dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1]; + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1]; const int num_loops = 1000000000 / (in_w + in_h); aom_usec_timer timer; @@ -466,10 +465,9 @@ class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) { for (int ii = 0; ii < 2; ii++) { for (int jj = 0; jj < 4; jj++) { - dist_wtd_comp_params.fwd_offset = - quant_dist_lookup_table[ii][jj][0]; + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii]; dist_wtd_comp_params.bck_offset = - quant_dist_lookup_table[ii][jj][1]; + quant_dist_lookup_table[jj][1 - ii]; const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); @@ -524,8 +522,8 @@ class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest DIST_WTD_COMP_PARAMS dist_wtd_comp_params; dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; - dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0]; - dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1]; + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1]; int sub_x_q3 = 0; int sub_y_q3 = 0; const int num_loops = 1000000000 / (in_w + in_h); diff --git a/third_party/libaom/source/libaom/test/cpu_used_firstpass_test.cc b/third_party/libaom/source/libaom/test/cpu_used_firstpass_test.cc new file mode 100644 index 0000000000..c970c1977d --- /dev/null +++ b/third_party/libaom/source/libaom/test/cpu_used_firstpass_test.cc @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +const double kPsnrDiffThreshold = 0.1; +const int kFirstPassCpuUsed[] = { 2, 4, 6 }; + +class CpuUsedFirstpassTest : public ::libaom_test::CodecTestWithParam<int>, + public ::libaom_test::EncoderTest { + protected: + CpuUsedFirstpassTest() + : EncoderTest(GET_PARAM(0)), second_pass_cpu_used_(GET_PARAM(1)) {} + virtual ~CpuUsedFirstpassTest() {} + + virtual void SetUp() { + InitializeConfig(::libaom_test::kTwoPassGood); + const aom_rational timebase = { 1, 30 }; + cfg_.g_timebase = timebase; + cfg_.rc_end_usage = AOM_VBR; + cfg_.rc_target_bitrate = 1000; + cfg_.g_lag_in_frames = 19; + cfg_.g_threads = 0; + init_flags_ = AOM_CODEC_USE_PSNR; + } + + virtual void BeginPassHook(unsigned int pass) { + psnr_ = 0.0; + nframes_ = 0; + + if (pass == 0) + cpu_used_ = first_pass_cpu_used_; + else + cpu_used_ = second_pass_cpu_used_; + } + + virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AOME_SET_CPUUSED, cpu_used_); + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1); + encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7); + encoder->Control(AOME_SET_ARNR_STRENGTH, 5); + } + } + + double GetAveragePsnr() const { + if (nframes_) return psnr_ / nframes_; + return 0.0; + } + + double GetPsnrDiffThreshold() { return kPsnrDiffThreshold; } + + void DoTest() { + libaom_test::I420VideoSource 
video("niklas_640_480_30.yuv", 640, 480, + cfg_.g_timebase.den, cfg_.g_timebase.num, + 0, 30); + const int size = sizeof(kFirstPassCpuUsed) / sizeof(kFirstPassCpuUsed[0]); + double ref_psnr; + double psnr_diff; + + first_pass_cpu_used_ = second_pass_cpu_used_; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); // same preset case ref_psnr + ref_psnr = GetAveragePsnr(); + + for (int i = 0; i < size; i++) { + first_pass_cpu_used_ = kFirstPassCpuUsed[i]; + if (first_pass_cpu_used_ == second_pass_cpu_used_) continue; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + psnr_diff = abs(ref_psnr - GetAveragePsnr()); + EXPECT_LT(psnr_diff, GetPsnrDiffThreshold()) + << "first pass cpu used = " << first_pass_cpu_used_ + << ", second pass cpu used = " << second_pass_cpu_used_; + } + } + + int cpu_used_; + int first_pass_cpu_used_; + int second_pass_cpu_used_; + unsigned int nframes_; + double psnr_; +}; + +TEST_P(CpuUsedFirstpassTest, FirstPassTest) { DoTest(); } + +class CpuUsedFirstpassTestLarge : public CpuUsedFirstpassTest {}; + +TEST_P(CpuUsedFirstpassTestLarge, FirstPassTest) { DoTest(); } + +AV1_INSTANTIATE_TEST_SUITE(CpuUsedFirstpassTestLarge, + ::testing::Values(2)); // cpu_used + +AV1_INSTANTIATE_TEST_SUITE(CpuUsedFirstpassTest, + ::testing::Values(4, 6)); // cpu_used +} // namespace diff --git a/third_party/libaom/source/libaom/test/datarate_test.cc b/third_party/libaom/source/libaom/test/datarate_test.cc index 2ff074fe8c..71f8b0f37b 100644 --- a/third_party/libaom/source/libaom/test/datarate_test.cc +++ b/third_party/libaom/source/libaom/test/datarate_test.cc @@ -57,7 +57,9 @@ class DatarateTestLarge ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.7) << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.4) + // FIXME(jingning): Lower this test threshold after vbr mode can render + // sufficiently accurate bit rate. 
+ ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.45) << " The datarate for the file is greater than target by too much!"; } diff --git a/third_party/libaom/source/libaom/test/datarate_test.h b/third_party/libaom/source/libaom/test/datarate_test.h index 0396034874..1b0d515efa 100644 --- a/third_party/libaom/source/libaom/test/datarate_test.h +++ b/third_party/libaom/source/libaom/test/datarate_test.h @@ -63,6 +63,7 @@ class DatarateTest : public ::libaom_test::EncoderTest { encoder->Control(AV1E_SET_COEFF_COST_UPD_FREQ, 2); encoder->Control(AV1E_SET_MODE_COST_UPD_FREQ, 2); encoder->Control(AV1E_SET_MV_COST_UPD_FREQ, 2); + encoder->Control(AV1E_SET_DV_COST_UPD_FREQ, 2); } } diff --git a/third_party/libaom/source/libaom/test/encode_api_test.cc b/third_party/libaom/source/libaom/test/encode_api_test.cc index eb918460ae..70b0612ced 100644 --- a/third_party/libaom/source/libaom/test/encode_api_test.cc +++ b/third_party/libaom/source/libaom/test/encode_api_test.cc @@ -20,6 +20,12 @@ namespace { +#if CONFIG_REALTIME_ONLY +const int kUsage = 1; +#else +const int kUsage = 0; +#endif + TEST(EncodeAPI, InvalidParams) { uint8_t buf[1] = { 0 }; aom_image_t img; @@ -45,7 +51,7 @@ TEST(EncodeAPI, InvalidParams) { EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, NULL, 0)); EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_config_default(iface, &cfg, 3)); - EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage)); EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0)); EXPECT_EQ(NULL, aom_codec_get_global_headers(NULL)); @@ -63,13 +69,14 @@ TEST(EncodeAPI, InvalidControlId) { aom_codec_iface_t *iface = aom_codec_av1_cx(); aom_codec_ctx_t enc; aom_codec_enc_cfg_t cfg; - EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage)); EXPECT_EQ(AOM_CODEC_OK, 
aom_codec_enc_init(&enc, iface, &cfg, 0)); EXPECT_EQ(AOM_CODEC_ERROR, aom_codec_control(&enc, -1, 0)); EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_control(&enc, 0, 0)); EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc)); } +#if !CONFIG_REALTIME_ONLY TEST(EncodeAPI, AllIntraMode) { aom_codec_iface_t *iface = aom_codec_av1_cx(); aom_codec_ctx_t enc; @@ -93,5 +100,6 @@ TEST(EncodeAPI, AllIntraMode) { cfg.kf_max_dist = 1; EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0)); } +#endif } // namespace diff --git a/third_party/libaom/source/libaom/test/encode_small_width_height_test.cc b/third_party/libaom/source/libaom/test/encode_small_width_height_test.cc index 6f52fd58ef..ad493e5ce0 100644 --- a/third_party/libaom/source/libaom/test/encode_small_width_height_test.cc +++ b/third_party/libaom/source/libaom/test/encode_small_width_height_test.cc @@ -19,11 +19,17 @@ #include "aom/aomcx.h" #include "aom/aom_encoder.h" +#include "config/aom_config.h" namespace { // Dummy buffer of zero samples. constexpr unsigned char kBuffer[256 * 512 + 2 * 128 * 256] = { 0 }; +#if CONFIG_REALTIME_ONLY +const int kUsage = 1; +#else +const int kUsage = 0; +#endif TEST(EncodeSmallWidthHeight, SmallWidthMultiThreaded) { // The image has only one tile and the tile is two AV1 superblocks wide. @@ -37,7 +43,7 @@ TEST(EncodeSmallWidthHeight, SmallWidthMultiThreaded) { aom_codec_iface_t *iface = aom_codec_av1_cx(); aom_codec_enc_cfg_t cfg; - EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage)); cfg.g_threads = 2; cfg.g_w = kWidth; cfg.g_h = kHeight; @@ -49,6 +55,7 @@ TEST(EncodeSmallWidthHeight, SmallWidthMultiThreaded) { EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc)); } +#if !CONFIG_REALTIME_ONLY TEST(EncodeSmallWidthHeight, SmallWidthMultiThreadedSpeed0) { // The image has only one tile and the tile is two AV1 superblocks wide. 
// For speed 0, superblock size is 128x128 (see av1_select_sb_size()). @@ -72,6 +79,7 @@ TEST(EncodeSmallWidthHeight, SmallWidthMultiThreadedSpeed0) { EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0)); EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc)); } +#endif TEST(EncodeSmallWidthHeight, SmallHeightMultiThreaded) { // The image has only one tile and the tile is one AV1 superblock tall. @@ -85,7 +93,7 @@ TEST(EncodeSmallWidthHeight, SmallHeightMultiThreaded) { aom_codec_iface_t *iface = aom_codec_av1_cx(); aom_codec_enc_cfg_t cfg; - EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage)); cfg.g_threads = 2; cfg.g_w = kWidth; cfg.g_h = kHeight; @@ -97,6 +105,7 @@ TEST(EncodeSmallWidthHeight, SmallHeightMultiThreaded) { EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc)); } +#if !CONFIG_REALTIME_ONLY TEST(EncodeSmallWidthHeight, SmallHeightMultiThreadedSpeed0) { // The image has only one tile and the tile is one AV1 superblock tall. // For speed 0, superblock size is 128x128 (see av1_select_sb_size()). 
@@ -120,5 +129,5 @@ TEST(EncodeSmallWidthHeight, SmallHeightMultiThreadedSpeed0) { EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0)); EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc)); } - +#endif } // namespace diff --git a/third_party/libaom/source/libaom/test/encode_test_driver.cc b/third_party/libaom/source/libaom/test/encode_test_driver.cc index 058e08e5d7..4a8801f06c 100644 --- a/third_party/libaom/source/libaom/test/encode_test_driver.cc +++ b/third_party/libaom/source/libaom/test/encode_test_driver.cc @@ -226,18 +226,18 @@ void EncoderTest::RunLoop(VideoSource *video) { encoder->EncodeFrame(video, frame_flags_); CxDataIterator iter = encoder->GetCxData(); + bool has_cxdata = false; #if CONFIG_AV1_DECODER - bool has_cxdata = false; bool has_dxdata = false; #endif while (const aom_codec_cx_pkt_t *pkt = iter.Next()) { pkt = MutateEncoderOutputHook(pkt); again = true; switch (pkt->kind) { - case AOM_CODEC_CX_FRAME_PKT: -#if CONFIG_AV1_DECODER + case AOM_CODEC_CX_FRAME_PKT: // has_cxdata = true; +#if CONFIG_AV1_DECODER if (decoder.get() != NULL && DoDecode()) { aom_codec_err_t res_dec; if (DoDecodeInvisible()) { @@ -267,21 +267,27 @@ void EncoderTest::RunLoop(VideoSource *video) { default: break; } } -#if CONFIG_AV1_DECODER - if (has_dxdata && has_cxdata) { + if (has_cxdata) { const aom_image_t *img_enc = encoder->GetPreviewFrame(); - DxDataIterator dec_iter = decoder->GetDxData(); - const aom_image_t *img_dec = dec_iter.Next(); - if (img_enc && img_dec) { - const bool res = - compare_img(img_enc, img_dec, NULL, NULL, NULL, NULL, NULL); - if (!res) { // Mismatch - MismatchHook(img_enc, img_dec); + if (img_enc) { + CalculateFrameLevelSSIM(video->img(), img_enc, cfg_.g_bit_depth, + cfg_.g_input_bit_depth); + } +#if CONFIG_AV1_DECODER + if (has_dxdata) { + DxDataIterator dec_iter = decoder->GetDxData(); + const aom_image_t *img_dec = dec_iter.Next(); + if (img_enc && img_dec) { + const bool res = + compare_img(img_enc, img_dec, NULL, NULL, NULL, NULL, 
NULL); + if (!res) { // Mismatch + MismatchHook(img_enc, img_dec); + } } + if (img_dec) DecompressedFrameHook(*img_dec, video->pts()); } - if (img_dec) DecompressedFrameHook(*img_dec, video->pts()); - } #endif + } if (!Continue()) break; } // Loop over spatial layers } diff --git a/third_party/libaom/source/libaom/test/encode_test_driver.h b/third_party/libaom/source/libaom/test/encode_test_driver.h index 5da3ac5d0b..468a41bef3 100644 --- a/third_party/libaom/source/libaom/test/encode_test_driver.h +++ b/third_party/libaom/source/libaom/test/encode_test_driver.h @@ -134,6 +134,11 @@ class Encoder { ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); } + void Control(int ctrl_id, struct aom_ext_part_funcs *arg) { + const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg); + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); + } + #if CONFIG_AV1_ENCODER void Control(int ctrl_id, aom_active_map_t *arg) { const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg); @@ -216,6 +221,12 @@ class EncoderTest { // Hook to be called on every first pass stats packet. virtual void StatsPktHook(const aom_codec_cx_pkt_t * /*pkt*/) {} + // Calculates SSIM at frame level. + virtual void CalculateFrameLevelSSIM(const aom_image_t * /*img_src*/, + const aom_image_t * /*img_enc*/, + aom_bit_depth_t /*bit_depth*/, + unsigned int /*input_bit_depth*/) {} + // Hook to determine whether the encode loop should continue. virtual bool Continue() const { return !(::testing::Test::HasFatalFailure() || abort_); diff --git a/third_party/libaom/source/libaom/test/encodemb_test.cc b/third_party/libaom/source/libaom/test/encodemb_test.cc new file mode 100644 index 0000000000..4c725c7dea --- /dev/null +++ b/third_party/libaom/source/libaom/test/encodemb_test.cc @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdint.h> +#include <vector> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "av1/encoder/block.h" +#include "av1/encoder/encodemb.h" +#include "av1/common/scan.h" + +namespace { + +// Reorders 'qcoeff_lexico', which is in lexicographic order (row by row), into +// scan order (zigzag) in 'qcoeff_scan'. +void ToScanOrder(TX_SIZE tx_size, TX_TYPE tx_type, tran_low_t *qcoeff_lexico, + tran_low_t *qcoeff_scan) { + const int max_eob = av1_get_max_eob(tx_size); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + for (int i = 0; i < max_eob; ++i) { + qcoeff_scan[i] = qcoeff_lexico[scan_order->scan[i]]; + } +} + +// Reorders 'qcoeff_scan', which is in scan order (zigzag), into lexicographic +// order (row by row) in 'qcoeff_lexico'. +void ToLexicoOrder(TX_SIZE tx_size, TX_TYPE tx_type, tran_low_t *qcoeff_scan, + tran_low_t *qcoeff_lexico) { + const int max_eob = av1_get_max_eob(tx_size); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + for (int i = 0; i < max_eob; ++i) { + qcoeff_lexico[scan_order->scan[i]] = qcoeff_scan[i]; + } +} + +// Runs coefficient dropout on 'qcoeff_scan'. +void Dropout(TX_SIZE tx_size, TX_TYPE tx_type, int dropout_num_before, + int dropout_num_after, tran_low_t *qcoeff_scan) { + tran_low_t qcoeff[MAX_TX_SQUARE]; + // qcoeff_scan is assumed to be in scan order, since tests are easier to + // understand this way, but av1_dropout_qcoeff expects coeffs in lexico order + // so we convert to lexico then back to scan afterwards. 
+ ToLexicoOrder(tx_size, tx_type, qcoeff_scan, qcoeff); + + const int max_eob = av1_get_max_eob(tx_size); + const int kDequantFactor = 10; + tran_low_t dqcoeff[MAX_TX_SQUARE]; + for (int i = 0; i < max_eob; ++i) { + dqcoeff[i] = qcoeff[i] * kDequantFactor; + } + + uint16_t eob = max_eob; + while (eob > 0 && qcoeff_scan[eob - 1] == 0) --eob; + + MACROBLOCK mb; + const int kPlane = 0; + const int kBlock = 0; + memset(&mb, 0, sizeof(mb)); + uint16_t eobs[] = { eob }; + mb.plane[kPlane].eobs = eobs; + mb.plane[kPlane].qcoeff = qcoeff; + mb.plane[kPlane].dqcoeff = dqcoeff; + uint8_t txb_entropy_ctx[1]; + mb.plane[kPlane].txb_entropy_ctx = txb_entropy_ctx; + + av1_dropout_qcoeff_num(&mb, kPlane, kBlock, tx_size, tx_type, + dropout_num_before, dropout_num_after); + + ToScanOrder(tx_size, tx_type, qcoeff, qcoeff_scan); + + // Check updated eob value is valid. + uint16_t new_eob = max_eob; + while (new_eob > 0 && qcoeff_scan[new_eob - 1] == 0) --new_eob; + EXPECT_EQ(new_eob, mb.plane[kPlane].eobs[0]); + + // Check qqcoeff is still valid. + for (int i = 0; i < max_eob; ++i) { + EXPECT_EQ(qcoeff[i] * kDequantFactor, dqcoeff[i]); + } +} + +void ExpectArrayEq(tran_low_t *actual, std::vector<tran_low_t> expected) { + for (size_t i = 0; i < expected.size(); ++i) { + EXPECT_EQ(expected[i], actual[i]) << "Arrays differ at index " << i; + } +} + +static constexpr TX_TYPE kTxType = DCT_DCT; + +TEST(DropoutTest, KeepsLargeCoeffs) { + const TX_SIZE tx_size = TX_8X4; + const uint32_t dropout_num_before = 4; + const uint32_t dropout_num_after = 6; + // Large isolated coeffs should be preserved. 
+ tran_low_t qcoeff_scan[] = { 0, 0, 0, 0, 0, 0, 42, 0, // should be kept + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, -30, // should be kept + 0, 0, 0, 0, 0, 0, 0, 0 }; + Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan); + ExpectArrayEq(qcoeff_scan, { 0, 0, 0, 0, 0, 0, 42, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, -30, // + 0, 0, 0, 0, 0, 0, 0, 0 }); +} + +TEST(DropoutTest, RemovesSmallIsolatedCoeffs) { + const TX_SIZE tx_size = TX_8X4; + const uint32_t dropout_num_before = 4; + const uint32_t dropout_num_after = 6; + // Small isolated coeffs should be removed. + tran_low_t qcoeff_scan[] = { 0, 0, 0, 0, 1, 0, 0, 0, // should be removed + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, -2, 0, 0, 0, // should be removed + 0, 0, 0, 0, 0, 0, 0, 0 }; + Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan); + ExpectArrayEq(qcoeff_scan, { 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0 }); +} + +TEST(DropoutTest, KeepsSmallCoeffsAmongLargeOnes) { + const TX_SIZE tx_size = TX_8X4; + const uint32_t dropout_num_before = 4; + const uint32_t dropout_num_after = 6; + // Small coeffs that are not isolated (not enough zeros before/after should be + // kept). 
+ tran_low_t qcoeff_scan[] = { + 1, 0, 0, 0, -5, 0, 0, -1, // should be kept + 0, 0, 0, 10, 0, 0, 2, 0, // should be kept + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, -2, 0, 0, 0, 0, 0, 0 // should be removed + }; // should be removed + Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan); + ExpectArrayEq(qcoeff_scan, { 1, 0, 0, 0, -5, 0, 0, -1, // + 0, 0, 0, 10, 0, 0, 2, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0 }); +} + +TEST(DropoutTest, KeepsSmallCoeffsCloseToStartOrEnd) { + const TX_SIZE tx_size = TX_8X4; + const uint32_t dropout_num_before = 4; + const uint32_t dropout_num_after = 6; + // Small coeffs that are too close to the beginning or end of the block + // should also be kept (not enough zeroes before/after). + tran_low_t qcoeff_scan[] = { 0, 0, -1, 0, 0, 0, 0, 0, // should be kept + 0, 0, 0, 10, 0, 0, 0, 0, // should be kept + 0, 0, 0, 2, 0, 0, 0, 0, // should be removed + 0, 0, 0, 0, 0, 0, -1, 0 }; // should be kept + Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan); + ExpectArrayEq(qcoeff_scan, { 0, 0, -1, 0, 0, 0, 0, 0, // + 0, 0, 0, 10, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, -1, 0 }); +} + +TEST(DropoutTest, RemovesSmallClusterOfCoeffs) { + const TX_SIZE tx_size = TX_8X4; + const uint32_t dropout_num_before = 4; + const uint32_t dropout_num_after = 6; + // Small clusters (<= kDropoutContinuityMax) of small coeffs should be + // removed. 
+ tran_low_t qcoeff_scan_two[] = { + 0, 0, 0, 0, 1, 0, 0, -1, // should be removed + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 1, 0, // should be removed + 0, 0, 0, 0, 0, 0, 0, 0 + }; + Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, + qcoeff_scan_two); + ExpectArrayEq(qcoeff_scan_two, { 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0 }); +} + +TEST(DropoutTest, KeepsLargeClusterOfCoeffs) { + const TX_SIZE tx_size = TX_8X4; + const uint32_t dropout_num_before = 4; + const uint32_t dropout_num_after = 6; + // Large clusters (> kDropoutContinuityMax) of small coeffs should be kept. + tran_low_t qcoeff_scan[] = { 0, 0, 0, 0, 1, 0, 1, -1, // should be kept + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, -2, 0, 0, // should be removed + 0, 0, 0, 0, 0, 0, 0, 0 }; + Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan); + ExpectArrayEq(qcoeff_scan, { 0, 0, 0, 0, 1, 0, 1, -1, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0 }); +} + +TEST(DropoutTest, NumBeforeLargerThanNumAfter) { + const TX_SIZE tx_size = TX_8X4; + const uint32_t dropout_num_before = 4; + const uint32_t dropout_num_after = 2; + // The second coeff (-2) doesn't seem to meet the dropout_num_before + // criteria. But since the first coeff (1) will be dropped, it will meet + // the criteria and should be dropped too. + tran_low_t qcoeff_scan[] = { 0, 0, 0, 0, 1, 0, 0, 0, // should be removed + -2, 0, 0, 0, 0, 0, 0, 0, // should be removed + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0 }; + Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan); + ExpectArrayEq(qcoeff_scan, { 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0 }); +} + +// More complex test combining other test cases. 
+TEST(DropoutTest, ComplexTest) { + const TX_SIZE tx_size = TX_8X8; + const uint32_t dropout_num_before = 4; + const uint32_t dropout_num_after = 2; + tran_low_t qcoeff_scan[] = { 1, 12, 0, 0, 0, 0, 1, 0, // + 0, 0, 0, -12, 0, 0, 0, 1, // + 0, 0, -2, 0, 1, 0, 0, 1, // + 0, 0, 0, 0, 5, 0, -1, 0, // + 0, 0, 0, 1, 0, 0, 0, -1, // + 0, 0, 0, 0, 2, 0, 0, 0, // + 0, 1, 0, 0, 0, 5, 0, 0, // + 0, 0, 1, 1, 0, 0, 0, -2 }; + Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan); + ExpectArrayEq(qcoeff_scan, { 1, 12, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, -12, 0, 0, 0, 1, // + 0, 0, -2, 0, 1, 0, 0, 1, // + 0, 0, 0, 0, 5, 0, -1, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, 0, // + 0, 0, 0, 0, 0, 5, 0, 0, // + 0, 0, 0, 0, 0, 0, 0, -2 }); +} + +} // namespace diff --git a/third_party/libaom/source/libaom/test/end_to_end_psnr_test.cc b/third_party/libaom/source/libaom/test/end_to_end_psnr_test.cc new file mode 100644 index 0000000000..5574c1a909 --- /dev/null +++ b/third_party/libaom/source/libaom/test/end_to_end_psnr_test.cc @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <memory> +#include <ostream> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "test/yuv_video_source.h" + +namespace { + +const unsigned int kWidth = 160; +const unsigned int kHeight = 90; +const unsigned int kFramerate = 50; +const unsigned int kFrames = 10; +const int kBitrate = 500; +const unsigned int kCqLevel = 18; +// List of psnr thresholds for speed settings 0-8 and 4 encoding modes +const double kPsnrThreshold[][4] = { + { 35.7, 44.4, 39.5, 41.9 }, { 35.7, 44.4, 39.5, 41.9 }, + { 35.7, 44.4, 39.4, 41.9 }, { 35.7, 44.4, 39.1, 41.8 }, + { 35.6, 44.4, 39.1, 41.8 }, { 35.0, 44.3, 38.7, 41.8 }, + { 35.0, 44.3, 38.7, 41.3 }, { 35.0, 44.3, 38.7, 40.8 }, + { 35.0, 44.3, 38.7, 40.8 } +}; + +typedef struct { + const char *filename; + unsigned int input_bit_depth; + aom_img_fmt fmt; + aom_bit_depth_t bit_depth; + unsigned int profile; +} TestVideoParam; + +std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) { + return os << "TestVideoParam { filename:" << test_arg.filename + << " input_bit_depth:" << test_arg.input_bit_depth + << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth + << " profile:" << test_arg.profile << " }"; +} + +const TestVideoParam kTestVectors[] = { + { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 }, + { "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422, AOM_BITS_8, 2 }, + { "park_joy_90p_8_444.y4m", 8, AOM_IMG_FMT_I444, AOM_BITS_8, 1 }, +#if CONFIG_AV1_HIGHBITDEPTH + { "park_joy_90p_10_420.y4m", 10, AOM_IMG_FMT_I42016, AOM_BITS_10, 0 }, + { "park_joy_90p_10_422.y4m", 10, AOM_IMG_FMT_I42216, AOM_BITS_10, 2 }, + { "park_joy_90p_10_444.y4m", 10, AOM_IMG_FMT_I44416, AOM_BITS_10, 1 }, + { "park_joy_90p_12_420.y4m", 12, AOM_IMG_FMT_I42016, AOM_BITS_12, 2 }, + { "park_joy_90p_12_422.y4m", 12, AOM_IMG_FMT_I42216, AOM_BITS_12, 
2 }, + { "park_joy_90p_12_444.y4m", 12, AOM_IMG_FMT_I44416, AOM_BITS_12, 2 }, +#endif +}; + +// Encoding modes tested +const libaom_test::TestMode kEncodingModeVectors[] = { + ::libaom_test::kTwoPassGood, + ::libaom_test::kOnePassGood, + ::libaom_test::kRealTime, +}; + +// Speed settings tested +const int kCpuUsedVectors[] = { 1, 2, 3, 5, 6 }; + +int is_extension_y4m(const char *filename) { + const char *dot = strrchr(filename, '.'); + if (!dot || dot == filename) + return 0; + else + return !strcmp(dot, ".y4m"); +} + +class EndToEndTest + : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, + TestVideoParam, int>, + public ::libaom_test::EncoderTest { + protected: + EndToEndTest() + : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(2)), + cpu_used_(GET_PARAM(3)), psnr_(0.0), nframes_(0), + encoding_mode_(GET_PARAM(1)) {} + + virtual ~EndToEndTest() {} + + virtual void SetUp() { + InitializeConfig(encoding_mode_); + if (encoding_mode_ == ::libaom_test::kOnePassGood || + encoding_mode_ == ::libaom_test::kTwoPassGood) { + cfg_.g_lag_in_frames = 5; + } else if (encoding_mode_ == ::libaom_test::kRealTime) { + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + } + } + + virtual void BeginPassHook(unsigned int) { + psnr_ = 0.0; + nframes_ = 0; + } + + virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); + encoder->Control(AV1E_SET_TILE_COLUMNS, 4); + encoder->Control(AOME_SET_CPUUSED, cpu_used_); + // Test screen coding tools at cpu_used = 1 && encoding mode is two-pass. 
+ if (cpu_used_ == 1 && encoding_mode_ == ::libaom_test::kTwoPassGood) + encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN); + else + encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT); + if (encoding_mode_ == ::libaom_test::kOnePassGood || + encoding_mode_ == ::libaom_test::kTwoPassGood) { + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1); + encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7); + encoder->Control(AOME_SET_ARNR_STRENGTH, 5); + } else if (encoding_mode_ == ::libaom_test::kAllIntra) { + encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel); + } + } + } + + double GetAveragePsnr() const { + if (nframes_) return psnr_ / nframes_; + return 0.0; + } + + double GetPsnrThreshold() { + return kPsnrThreshold[cpu_used_][encoding_mode_]; + } + + void DoTest() { + cfg_.rc_target_bitrate = kBitrate; + cfg_.g_error_resilient = 0; + cfg_.g_profile = test_video_param_.profile; + cfg_.g_input_bit_depth = test_video_param_.input_bit_depth; + cfg_.g_bit_depth = test_video_param_.bit_depth; + init_flags_ = AOM_CODEC_USE_PSNR; + if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH; + + std::unique_ptr<libaom_test::VideoSource> video; + if (is_extension_y4m(test_video_param_.filename)) { + video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0, + kFrames)); + } else { + video.reset(new libaom_test::YUVVideoSource( + test_video_param_.filename, test_video_param_.fmt, kWidth, kHeight, + kFramerate, 1, 0, kFrames)); + } + ASSERT_TRUE(video.get() != NULL); + + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + const double psnr = GetAveragePsnr(); + EXPECT_GT(psnr, GetPsnrThreshold()) + << "cpu used = " << cpu_used_ << ", encoding mode = " << encoding_mode_; + } + + TestVideoParam test_video_param_; + int cpu_used_; + + private: + double psnr_; + unsigned int nframes_; + libaom_test::TestMode encoding_mode_; +}; + +class EndToEndTestLarge : public EndToEndTest {}; + +class EndToEndAllIntraTestLarge : public EndToEndTest {}; + +class 
EndToEndAllIntraTest : public EndToEndTest {}; + +TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) { DoTest(); } + +TEST_P(EndToEndTest, EndtoEndPSNRTest) { DoTest(); } + +TEST_P(EndToEndAllIntraTestLarge, EndtoEndPSNRTest) { DoTest(); } + +TEST_P(EndToEndAllIntraTest, EndtoEndPSNRTest) { DoTest(); } + +AV1_INSTANTIATE_TEST_SUITE(EndToEndTestLarge, + ::testing::ValuesIn(kEncodingModeVectors), + ::testing::ValuesIn(kTestVectors), + ::testing::ValuesIn(kCpuUsedVectors)); + +AV1_INSTANTIATE_TEST_SUITE(EndToEndTest, + ::testing::Values(::libaom_test::kTwoPassGood), + ::testing::Values(kTestVectors[2]), // 444 + ::testing::Values(3)); // cpu_used + +AV1_INSTANTIATE_TEST_SUITE(EndToEndAllIntraTestLarge, + ::testing::Values(::libaom_test::kAllIntra), + ::testing::ValuesIn(kTestVectors), + ::testing::Values(2, 4, 6, 8)); // cpu_used + +AV1_INSTANTIATE_TEST_SUITE(EndToEndAllIntraTest, + ::testing::Values(::libaom_test::kAllIntra), + ::testing::Values(kTestVectors[0]), // 420 + ::testing::Values(6)); // cpu_used +} // namespace diff --git a/third_party/libaom/source/libaom/test/end_to_end_ssim_test.cc b/third_party/libaom/source/libaom/test/end_to_end_ssim_test.cc new file mode 100644 index 0000000000..1e638d7b45 --- /dev/null +++ b/third_party/libaom/source/libaom/test/end_to_end_ssim_test.cc @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_ports/mem.h" +#include "aom_dsp/ssim.h" +#include "av1/common/blockd.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { + +const unsigned int kFrames = 10; +const unsigned int kCqLevel = 18; +// List of ssim thresholds for speed settings 0-8 with all intra encoding mode. +const double kSsimThreshold[] = { 83.4, 83.4, 83.4, 83.3, 83.3, + 83.0, 82.3, 81.1, 81.1 }; + +typedef struct { + const char *filename; + unsigned int input_bit_depth; + aom_img_fmt fmt; + aom_bit_depth_t bit_depth; + unsigned int profile; +} TestVideoParam; + +std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) { + return os << "TestVideoParam { filename:" << test_arg.filename + << " input_bit_depth:" << test_arg.input_bit_depth + << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth + << " profile:" << test_arg.profile << " }"; +} + +const TestVideoParam kTestVectors[] = { + { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 }, + { "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422, AOM_BITS_8, 2 }, + { "park_joy_90p_8_444.y4m", 8, AOM_IMG_FMT_I444, AOM_BITS_8, 1 }, +#if CONFIG_AV1_HIGHBITDEPTH + { "park_joy_90p_10_420.y4m", 10, AOM_IMG_FMT_I42016, AOM_BITS_10, 0 }, + { "park_joy_90p_10_422.y4m", 10, AOM_IMG_FMT_I42216, AOM_BITS_10, 2 }, + { "park_joy_90p_10_444.y4m", 10, AOM_IMG_FMT_I44416, AOM_BITS_10, 1 }, + { "park_joy_90p_12_420.y4m", 12, AOM_IMG_FMT_I42016, AOM_BITS_12, 2 }, + { "park_joy_90p_12_422.y4m", 12, AOM_IMG_FMT_I42216, AOM_BITS_12, 2 }, + { "park_joy_90p_12_444.y4m", 12, AOM_IMG_FMT_I44416, AOM_BITS_12, 2 }, +#endif +}; + +// This class is used to check adherence to given ssim value. 
+class EndToEndSSIMTest + : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, + TestVideoParam, int>, + public ::libaom_test::EncoderTest { + protected: + EndToEndSSIMTest() + : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), + test_video_param_(GET_PARAM(2)), cpu_used_(GET_PARAM(3)), nframes_(0), + ssim_(0.0) {} + + ~EndToEndSSIMTest() override {} + + void SetUp() override { InitializeConfig(encoding_mode_); } + + void BeginPassHook(unsigned int) override { + nframes_ = 0; + ssim_ = 0.0; + } + + void CalculateFrameLevelSSIM(const aom_image_t *img_src, + const aom_image_t *img_enc, + aom_bit_depth_t bit_depth, + unsigned int input_bit_depth) override { + double frame_ssim; + double plane_ssim[MAX_MB_PLANE] = { 0.0, 0.0, 0.0 }; + int crop_widths[PLANE_TYPES]; + int crop_heights[PLANE_TYPES]; + crop_widths[PLANE_TYPE_Y] = img_src->d_w; + crop_heights[PLANE_TYPE_Y] = img_src->d_h; + // Width of UV planes calculated based on chroma_shift values. + crop_widths[PLANE_TYPE_UV] = + img_src->x_chroma_shift == 1 ? (img_src->w + 1) >> 1 : img_src->w; + crop_heights[PLANE_TYPE_UV] = + img_src->y_chroma_shift == 1 ? (img_src->h + 1) >> 1 : img_src->h; + nframes_++; + +#if CONFIG_AV1_HIGHBITDEPTH + uint8_t is_hbd = bit_depth > AOM_BITS_8; + if (is_hbd) { + // HBD ssim calculation. + uint8_t shift = bit_depth - input_bit_depth; + for (int i = AOM_PLANE_Y; i < MAX_MB_PLANE; ++i) { + const int is_uv = i > AOM_PLANE_Y; + plane_ssim[i] = aom_highbd_ssim2( + CONVERT_TO_BYTEPTR(img_src->planes[i]), + CONVERT_TO_BYTEPTR(img_enc->planes[i]), + img_src->stride[is_uv] >> is_hbd, img_enc->stride[is_uv] >> is_hbd, + crop_widths[is_uv], crop_heights[is_uv], input_bit_depth, shift); + } + frame_ssim = plane_ssim[AOM_PLANE_Y] * .8 + + .1 * (plane_ssim[AOM_PLANE_U] + plane_ssim[AOM_PLANE_V]); + // Accumulate to find sequence level ssim value. 
+ ssim_ += frame_ssim; + return; + } +#else + (void)bit_depth; + (void)input_bit_depth; +#endif // CONFIG_AV1_HIGHBITDEPTH + + // LBD ssim calculation. + for (int i = AOM_PLANE_Y; i < MAX_MB_PLANE; ++i) { + const int is_uv = i > AOM_PLANE_Y; + plane_ssim[i] = aom_ssim2(img_src->planes[i], img_enc->planes[i], + img_src->stride[is_uv], img_enc->stride[is_uv], + crop_widths[is_uv], crop_heights[is_uv]); + } + frame_ssim = plane_ssim[AOM_PLANE_Y] * .8 + + .1 * (plane_ssim[AOM_PLANE_U] + plane_ssim[AOM_PLANE_V]); + // Accumulate to find sequence level ssim value. + ssim_ += frame_ssim; + } + + void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); + encoder->Control(AV1E_SET_TILE_COLUMNS, 4); + encoder->Control(AOME_SET_CPUUSED, cpu_used_); + encoder->Control(AOME_SET_TUNING, AOM_TUNE_SSIM); + encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel); + } + } + + double GetAverageSsim() const { + if (nframes_) return 100 * pow(ssim_ / nframes_, 8.0); + return 0.0; + } + + double GetSsimThreshold() { return kSsimThreshold[cpu_used_]; } + + void DoTest() { + cfg_.g_profile = test_video_param_.profile; + cfg_.g_input_bit_depth = test_video_param_.input_bit_depth; + cfg_.g_bit_depth = test_video_param_.bit_depth; + if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH; + + std::unique_ptr<libaom_test::VideoSource> video( + new libaom_test::Y4mVideoSource(test_video_param_.filename, 0, + kFrames)); + ASSERT_TRUE(video.get() != NULL); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + const double ssim = GetAverageSsim(); + EXPECT_GT(ssim, GetSsimThreshold()) + << "encoding mode = " << encoding_mode_ << ", cpu used = " << cpu_used_; + } + + private: + const libaom_test::TestMode encoding_mode_; + const TestVideoParam test_video_param_; + const int cpu_used_; + unsigned int nframes_; + double ssim_; +}; + +class EndToEndSSIMTestLarge : 
public EndToEndSSIMTest {}; + +TEST_P(EndToEndSSIMTestLarge, EndtoEndSSIMTest) { DoTest(); } + +TEST_P(EndToEndSSIMTest, EndtoEndSSIMTest) { DoTest(); } + +AV1_INSTANTIATE_TEST_SUITE(EndToEndSSIMTestLarge, + ::testing::Values(::libaom_test::kAllIntra), + ::testing::ValuesIn(kTestVectors), + ::testing::Values(2, 4, 6, 8)); // cpu_used + +AV1_INSTANTIATE_TEST_SUITE(EndToEndSSIMTest, + ::testing::Values(::libaom_test::kAllIntra), + ::testing::Values(kTestVectors[0]), // 420 + ::testing::Values(6)); // cpu_used +} // namespace diff --git a/third_party/libaom/source/libaom/test/error_resilience_test.cc b/third_party/libaom/source/libaom/test/error_resilience_test.cc index 31906a47d0..3999c9146d 100644 --- a/third_party/libaom/source/libaom/test/error_resilience_test.cc +++ b/third_party/libaom/source/libaom/test/error_resilience_test.cc @@ -358,6 +358,10 @@ TEST_P(ErrorResilienceTestLarge, OnVersusOff) { // if we lose (i.e., drop before decoding) a set of droppable // frames (i.e., frames that don't update any reference buffers). 
TEST_P(ErrorResilienceTestLarge, DropFramesWithoutRecovery) { + if (GET_PARAM(1) == ::libaom_test::kOnePassGood && GET_PARAM(2) == 1) { + fprintf(stderr, "Skipping test case #1 because of bug aomedia:3002\n"); + return; + } SetupEncoder(500, 10); libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, cfg_.g_timebase.den, cfg_.g_timebase.num, diff --git a/third_party/libaom/source/libaom/test/ethread_test.cc b/third_party/libaom/source/libaom/test/ethread_test.cc index 5bf8762052..78811b65cf 100644 --- a/third_party/libaom/source/libaom/test/ethread_test.cc +++ b/third_party/libaom/source/libaom/test/ethread_test.cc @@ -21,6 +21,9 @@ #include "av1/encoder/firstpass.h" namespace { +const unsigned int kCqLevel = 18; + +#if !CONFIG_REALTIME_ONLY const size_t kFirstPassStatsSz = sizeof(FIRSTPASS_STATS); class AVxFirstPassEncoderThreadTest : public ::libaom_test::CodecTestWith4Params<libaom_test::TestMode, int, @@ -196,6 +199,7 @@ TEST_P(AVxFirstPassEncoderThreadTest, FirstPassStatsTest) { // Comparison 4 (between threads=4 and threads=8). 
compare_fp_stats_md5(&firstpass_stats); } +#endif // !CONFIG_REALTIME_ONLY class AVxEncoderThreadTest : public ::libaom_test::CodecTestWith5Params<libaom_test::TestMode, int, @@ -227,11 +231,12 @@ class AVxEncoderThreadTest virtual void SetUp() { InitializeConfig(encoding_mode_); - if (encoding_mode_ != ::libaom_test::kRealTime) { + if (encoding_mode_ == ::libaom_test::kOnePassGood || + encoding_mode_ == ::libaom_test::kTwoPassGood) { cfg_.g_lag_in_frames = 6; cfg_.rc_2pass_vbr_minsection_pct = 5; cfg_.rc_2pass_vbr_maxsection_pct = 2000; - } else { + } else if (encoding_mode_ == ::libaom_test::kRealTime) { cfg_.g_error_resilient = 1; } cfg_.rc_max_quantizer = 56; @@ -248,18 +253,22 @@ class AVxEncoderThreadTest SetTileSize(encoder); encoder->Control(AOME_SET_CPUUSED, set_cpu_used_); encoder->Control(AV1E_SET_ROW_MT, row_mt_); - if (encoding_mode_ != ::libaom_test::kRealTime) { + if (encoding_mode_ == ::libaom_test::kOnePassGood || + encoding_mode_ == ::libaom_test::kTwoPassGood) { encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1); encoder->Control(AOME_SET_ARNR_MAXFRAMES, 5); encoder->Control(AOME_SET_ARNR_STRENGTH, 5); encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 0); encoder->Control(AV1E_SET_MAX_GF_INTERVAL, 4); - } else { + } else if (encoding_mode_ == ::libaom_test::kRealTime) { encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0); encoder->Control(AV1E_SET_AQ_MODE, 3); encoder->Control(AV1E_SET_COEFF_COST_UPD_FREQ, 2); encoder->Control(AV1E_SET_MODE_COST_UPD_FREQ, 2); encoder->Control(AV1E_SET_MV_COST_UPD_FREQ, 3); + encoder->Control(AV1E_SET_DV_COST_UPD_FREQ, 3); + } else { + encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel); } encoder_initialized_ = true; } @@ -423,31 +432,60 @@ class AVxEncoderThreadTest std::vector<std::string> md5_dec_; }; -TEST_P(AVxEncoderThreadTest, EncoderResultTest) { +class AVxEncoderThreadRTTest : public AVxEncoderThreadTest {}; + +TEST_P(AVxEncoderThreadRTTest, EncoderResultTest) { cfg_.large_scale_tile = 0; 
decoder_->Control(AV1_SET_TILE_MODE, 0); DoTest(); } -class AVxEncoderThreadRTTest : public AVxEncoderThreadTest {}; +// For real time mode, test speed 6, 7, 8, 9. +AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadRTTest, + ::testing::Values(::libaom_test::kRealTime), + ::testing::Values(6, 7, 8, 9), + ::testing::Values(0, 2), ::testing::Values(0, 2), + ::testing::Values(0, 1)); -TEST_P(AVxEncoderThreadRTTest, EncoderResultTest) { +#if !CONFIG_REALTIME_ONLY + +// The AVxEncoderThreadTestLarge takes up ~14% of total run-time of the +// Valgrind long tests. Exclude it; the smaller tests are still run. +#if !AOM_VALGRIND_BUILD +class AVxEncoderThreadTestLarge : public AVxEncoderThreadTest {}; + +TEST_P(AVxEncoderThreadTestLarge, EncoderResultTest) { cfg_.large_scale_tile = 0; decoder_->Control(AV1_SET_TILE_MODE, 0); DoTest(); } -class AVxEncoderThreadTestLarge : public AVxEncoderThreadTest {}; +// Test cpu_used 0, 1, 3 and 5. +AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadTestLarge, + ::testing::Values(::libaom_test::kTwoPassGood, + ::libaom_test::kOnePassGood), + ::testing::Values(0, 1, 3, 5), + ::testing::Values(1, 6), ::testing::Values(1, 6), + ::testing::Values(0, 1)); +#endif // !AOM_VALGRIND_BUILD -TEST_P(AVxEncoderThreadTestLarge, EncoderResultTest) { +TEST_P(AVxEncoderThreadTest, EncoderResultTest) { cfg_.large_scale_tile = 0; decoder_->Control(AV1_SET_TILE_MODE, 0); DoTest(); } -class AVxEncoderThreadRTTestLarge : public AVxEncoderThreadTest {}; +class AVxEncoderThreadAllIntraTest : public AVxEncoderThreadTest {}; -TEST_P(AVxEncoderThreadRTTestLarge, EncoderResultTest) { +TEST_P(AVxEncoderThreadAllIntraTest, EncoderResultTest) { + cfg_.large_scale_tile = 0; + decoder_->Control(AV1_SET_TILE_MODE, 0); + DoTest(); +} + +class AVxEncoderThreadAllIntraTestLarge : public AVxEncoderThreadTest {}; + +TEST_P(AVxEncoderThreadAllIntraTestLarge, EncoderResultTest) { cfg_.large_scale_tile = 0; decoder_->Control(AV1_SET_TILE_MODE, 0); DoTest(); @@ -466,26 +504,20 @@ 
AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadTest, ::testing::Values(2), ::testing::Values(0, 2), ::testing::Values(0, 2), ::testing::Values(0, 1)); -// Test cpu_used 7, 8, 9 here. -AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadRTTest, - ::testing::Values(::libaom_test::kRealTime), - ::testing::Values(7, 8, 9), ::testing::Values(0, 2), +// For all intra mode, test speed 0, 2, 4, 6, 8. +// Only test cpu_used 6 here. +AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadAllIntraTest, + ::testing::Values(::libaom_test::kAllIntra), + ::testing::Values(6), ::testing::Values(0, 2), ::testing::Values(0, 2), ::testing::Values(0, 1)); -// Test cpu_used 0, 1, 3 and 5. -AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadTestLarge, - ::testing::Values(::libaom_test::kTwoPassGood, - ::libaom_test::kOnePassGood), - ::testing::Values(0, 1, 3, 5), - ::testing::Values(1, 6), ::testing::Values(1, 6), - ::testing::Values(0, 1)); - -// Test cpu_used 0, 2, 4 and 6. -AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadRTTestLarge, - ::testing::Values(::libaom_test::kRealTime), - ::testing::Values(0, 2, 4, 6), +// Test cpu_used 0, 2, 4 and 8. +AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadAllIntraTestLarge, + ::testing::Values(::libaom_test::kAllIntra), + ::testing::Values(0, 2, 4, 8), ::testing::Values(1, 6), ::testing::Values(1, 6), ::testing::Values(0, 1)); +#endif // !CONFIG_REALTIME_ONLY class AVxEncoderThreadLSTest : public AVxEncoderThreadTest { virtual void SetTileSize(libaom_test::Encoder *encoder) { @@ -512,6 +544,10 @@ TEST_P(AVxEncoderThreadLSTest, EncoderResultTest) { DoTest(); } +// AVxEncoderThreadLSTestLarge takes up about 2% of total run-time of +// the Valgrind long tests. Since we already run AVxEncoderThreadLSTest, +// skip this one for Valgrind. 
+#if !CONFIG_REALTIME_ONLY && !AOM_VALGRIND_BUILD class AVxEncoderThreadLSTestLarge : public AVxEncoderThreadLSTest {}; TEST_P(AVxEncoderThreadLSTestLarge, EncoderResultTest) { @@ -526,4 +562,5 @@ AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadLSTestLarge, ::libaom_test::kOnePassGood), ::testing::Values(1, 3), ::testing::Values(0, 6), ::testing::Values(0, 6), ::testing::Values(1)); +#endif // !CONFIG_REALTIME_ONLY && !AOM_VALGRIND_BUILD } // namespace diff --git a/third_party/libaom/source/libaom/test/external_frame_buffer_test.cc b/third_party/libaom/source/libaom/test/external_frame_buffer_test.cc index 5006b5b6cf..b060ee3913 100644 --- a/third_party/libaom/source/libaom/test/external_frame_buffer_test.cc +++ b/third_party/libaom/source/libaom/test/external_frame_buffer_test.cc @@ -199,6 +199,7 @@ int do_not_release_aom_frame_buffer(void *user_priv, #endif // CONFIG_WEBM_IO +#if !CONFIG_REALTIME_ONLY // Class for testing passing in external frame buffers to libaom. class ExternalFrameBufferMD5Test : public ::libaom_test::DecoderTest, @@ -298,6 +299,7 @@ class ExternalFrameBufferMD5Test int num_buffers_; ExternalFrameBufferList fb_list_; }; +#endif // !CONFIG_REALTIME_ONLY #if CONFIG_WEBM_IO const char kAV1TestFile[] = "av1-1-b8-03-sizeup.mkv"; @@ -395,6 +397,7 @@ class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest { }; #endif // CONFIG_WEBM_IO +#if !CONFIG_REALTIME_ONLY // This test runs through the set of test vectors, and decodes them. // Libaom will call into the application to allocate a frame buffer when // needed. The md5 checksums are computed for each frame in the video file. @@ -438,6 +441,7 @@ TEST_P(ExternalFrameBufferMD5Test, ExtFBMD5Match) { // Decode frame, and check the md5 matching. 
ASSERT_NO_FATAL_FAILURE(RunLoop(video.get(), cfg)); } +#endif // !CONFIG_REALTIME_ONLY #if CONFIG_WEBM_IO TEST_F(ExternalFrameBufferTest, MinFrameBuffers) { @@ -447,7 +451,11 @@ TEST_F(ExternalFrameBufferTest, MinFrameBuffers) { ASSERT_EQ(AOM_CODEC_OK, SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, release_aom_frame_buffer)); +#if CONFIG_REALTIME_ONLY + ASSERT_EQ(AOM_CODEC_UNSUP_FEATURE, DecodeRemainingFrames()); +#else ASSERT_EQ(AOM_CODEC_OK, DecodeRemainingFrames()); +#endif } TEST_F(ExternalFrameBufferTest, EightJitterBuffers) { @@ -459,7 +467,11 @@ TEST_F(ExternalFrameBufferTest, EightJitterBuffers) { ASSERT_EQ(AOM_CODEC_OK, SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, release_aom_frame_buffer)); +#if CONFIG_REALTIME_ONLY + ASSERT_EQ(AOM_CODEC_UNSUP_FEATURE, DecodeRemainingFrames()); +#else ASSERT_EQ(AOM_CODEC_OK, DecodeRemainingFrames()); +#endif } TEST_F(ExternalFrameBufferTest, NotEnoughBuffers) { @@ -470,10 +482,14 @@ TEST_F(ExternalFrameBufferTest, NotEnoughBuffers) { ASSERT_EQ(AOM_CODEC_OK, SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, release_aom_frame_buffer)); +#if CONFIG_REALTIME_ONLY + ASSERT_EQ(AOM_CODEC_UNSUP_FEATURE, DecodeOneFrame()); +#else ASSERT_EQ(AOM_CODEC_OK, DecodeOneFrame()); // Only run this on long clips. Decoding a very short clip will return // AOM_CODEC_OK even with only 2 buffers. 
ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeRemainingFrames()); +#endif } TEST_F(ExternalFrameBufferTest, NoRelease) { @@ -481,8 +497,12 @@ TEST_F(ExternalFrameBufferTest, NoRelease) { ASSERT_EQ(AOM_CODEC_OK, SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, do_not_release_aom_frame_buffer)); +#if CONFIG_REALTIME_ONLY + ASSERT_EQ(AOM_CODEC_UNSUP_FEATURE, DecodeOneFrame()); +#else ASSERT_EQ(AOM_CODEC_OK, DecodeOneFrame()); ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeRemainingFrames()); +#endif } TEST_F(ExternalFrameBufferTest, NullRealloc) { @@ -515,11 +535,15 @@ TEST_F(ExternalFrameBufferTest, NullReleaseFunction) { } TEST_F(ExternalFrameBufferTest, SetAfterDecode) { +#if CONFIG_REALTIME_ONLY + ASSERT_EQ(AOM_CODEC_UNSUP_FEATURE, DecodeOneFrame()); +#else const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS; ASSERT_EQ(AOM_CODEC_OK, DecodeOneFrame()); ASSERT_EQ(AOM_CODEC_ERROR, SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, release_aom_frame_buffer)); +#endif } TEST_F(ExternalFrameBufferNonRefTest, ReleaseNonRefFrameBuffer) { @@ -527,14 +551,20 @@ TEST_F(ExternalFrameBufferNonRefTest, ReleaseNonRefFrameBuffer) { ASSERT_EQ(AOM_CODEC_OK, SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, release_aom_frame_buffer)); +#if CONFIG_REALTIME_ONLY + ASSERT_EQ(AOM_CODEC_UNSUP_FEATURE, DecodeRemainingFrames()); +#else ASSERT_EQ(AOM_CODEC_OK, DecodeRemainingFrames()); +#endif CheckFrameBufferRelease(); } #endif // CONFIG_WEBM_IO +#if !CONFIG_REALTIME_ONLY AV1_INSTANTIATE_TEST_SUITE( ExternalFrameBufferMD5Test, ::testing::ValuesIn(libaom_test::kAV1TestVectors, libaom_test::kAV1TestVectors + libaom_test::kNumAV1TestVectors)); +#endif } // namespace diff --git a/third_party/libaom/source/libaom/test/film_grain_table_test.cc b/third_party/libaom/source/libaom/test/film_grain_table_test.cc index 524d67d7bc..31fb908ffa 100644 --- a/third_party/libaom/source/libaom/test/film_grain_table_test.cc +++ 
b/third_party/libaom/source/libaom/test/film_grain_table_test.cc @@ -101,6 +101,20 @@ TEST(FilmGrainTableTest, AddAndLookupSingleSegment) { aom_film_grain_table_free(&table); } +TEST(FilmGrainTableTest, AddSingleSegmentRemoveBiggerSegment) { + aom_film_grain_table_t table; + aom_film_grain_t grain; + + memset(&table, 0, sizeof(table)); + + aom_film_grain_table_append(&table, 0, 1000, film_grain_test_vectors + 0); + EXPECT_TRUE(aom_film_grain_table_lookup(&table, 0, 1100, true, &grain)); + + EXPECT_EQ(0, table.head); + EXPECT_EQ(0, table.tail); + aom_film_grain_table_free(&table); +} + TEST(FilmGrainTableTest, SplitSingleSegment) { aom_film_grain_table_t table; aom_film_grain_t grain; diff --git a/third_party/libaom/source/libaom/test/frame_size_tests.cc b/third_party/libaom/source/libaom/test/frame_size_tests.cc index 38b6a63c3d..2365a20c24 100644 --- a/third_party/libaom/source/libaom/test/frame_size_tests.cc +++ b/third_party/libaom/source/libaom/test/frame_size_tests.cc @@ -73,6 +73,7 @@ TEST_F(AV1FrameSizeTests, OneByOneVideo) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } +#if !CONFIG_REALTIME_ONLY typedef struct { unsigned int width; unsigned int height; @@ -129,5 +130,6 @@ TEST_P(AV1LosslessFrameSizeTests, LosslessEncode) { AV1_INSTANTIATE_TEST_SUITE(AV1LosslessFrameSizeTests, ::testing::ValuesIn(FrameSizeTestParams), testing::Values(::libaom_test::kAllIntra)); +#endif // !CONFIG_REALTIME_ONLY } // namespace diff --git a/third_party/libaom/source/libaom/test/hbd_metrics_test.cc b/third_party/libaom/source/libaom/test/hbd_metrics_test.cc index 8044b516c1..39c2b4c101 100644 --- a/third_party/libaom/source/libaom/test/hbd_metrics_test.cc +++ b/third_party/libaom/source/libaom/test/hbd_metrics_test.cc @@ -88,7 +88,7 @@ double compute_hbd_aomssim(const YV12_BUFFER_CONFIG *source, double compute_aomssim(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *dest) { double ssim, weight; - aom_calc_ssim(source, dest, &weight, &ssim); + 
aom_lowbd_calc_ssim(source, dest, &weight, &ssim); return 100 * pow(ssim / weight, 8.0); } diff --git a/third_party/libaom/source/libaom/test/horz_superres_test.cc b/third_party/libaom/source/libaom/test/horz_superres_test.cc index 9733344111..2f0f3fdb6a 100644 --- a/third_party/libaom/source/libaom/test/horz_superres_test.cc +++ b/third_party/libaom/source/libaom/test/horz_superres_test.cc @@ -52,7 +52,7 @@ std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) { } const TestVideoParam kTestVideoVectors[] = { - { "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0, 25.5, + { "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0, 25.4, 45.0 }, #if CONFIG_AV1_HIGHBITDEPTH { "park_joy_90p_10_444.y4m", AOM_IMG_FMT_I44416, AOM_BITS_10, 1, 5, 0, 27.0, diff --git a/third_party/libaom/source/libaom/test/intrabc_test.cc b/third_party/libaom/source/libaom/test/intrabc_test.cc index b57eb6fab5..2c60596ab8 100644 --- a/third_party/libaom/source/libaom/test/intrabc_test.cc +++ b/third_party/libaom/source/libaom/test/intrabc_test.cc @@ -153,8 +153,10 @@ TEST(IntrabcTest, DvValidation) { xd.plane[2].subsampling_x = 1; xd.plane[2].subsampling_y = 1; + SequenceHeader seq_params = {}; AV1_COMMON cm; memset(&cm, 0, sizeof(cm)); + cm.seq_params = &seq_params; for (const DvTestCase &dv_case : kDvCases) { const int mi_row = xd.tile.mi_row_start + dv_case.mi_row_offset; diff --git a/third_party/libaom/source/libaom/test/invalid_file_test.cc b/third_party/libaom/source/libaom/test/invalid_file_test.cc index 77839fafcd..6ac8d1ac32 100644 --- a/third_party/libaom/source/libaom/test/invalid_file_test.cc +++ b/third_party/libaom/source/libaom/test/invalid_file_test.cc @@ -151,6 +151,7 @@ const DecodeParam kAV1InvalidFileTests[] = { { 1, "invalid-oss-fuzz-10779.ivf", NULL }, { 1, "invalid-oss-fuzz-11477.ivf", NULL }, { 1, "invalid-oss-fuzz-11479.ivf", "invalid-oss-fuzz-11479.ivf.res.2" }, + { 1, "invalid-oss-fuzz-33030.ivf", NULL }, #endif }; diff 
--git a/third_party/libaom/source/libaom/test/kf_test.cc b/third_party/libaom/source/libaom/test/kf_test.cc index cc2cc89c2b..2d228f2fef 100644 --- a/third_party/libaom/source/libaom/test/kf_test.cc +++ b/third_party/libaom/source/libaom/test/kf_test.cc @@ -100,10 +100,36 @@ class KeyFrameIntervalTestLarge aom_rc_mode end_usage_check_; }; +// Because valgrind builds take a very long time to run, use a lower +// resolution video for valgrind runs. +const char *TestFileName() { +#if AOM_VALGRIND_BUILD + return "hantro_collage_w176h144.yuv"; +#else + return "hantro_collage_w352h288.yuv"; +#endif // AOM_VALGRIND_BUILD +} + +int TestFileWidth() { +#if AOM_VALGRIND_BUILD + return 176; +#else + return 352; +#endif // AOM_VALGRIND_BUILD +} + +int TestFileHeight() { +#if AOM_VALGRIND_BUILD + return 144; +#else + return 288; +#endif // AOM_VALGRIND_BUILD +} + TEST_P(KeyFrameIntervalTestLarge, KeyFrameIntervalTest) { - libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - cfg_.g_timebase.den, cfg_.g_timebase.num, - 0, 75); + libaom_test::I420VideoSource video(TestFileName(), TestFileWidth(), + TestFileHeight(), cfg_.g_timebase.den, + cfg_.g_timebase.num, 0, 75); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_EQ(is_kf_interval_violated_, false) << kf_dist_param_; } @@ -187,9 +213,9 @@ TEST_P(ForcedKeyTestLarge, Frame1IsKey) { frame_num_ = 0; cfg_.g_lag_in_frames = lag_values[i]; is_kf_placement_violated_ = false; - libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - timebase.den, timebase.num, 0, - fwd_kf_enabled_ ? 60 : 30); + libaom_test::I420VideoSource video( + TestFileName(), TestFileWidth(), TestFileHeight(), timebase.den, + timebase.num, 0, fwd_kf_enabled_ ? 
60 : 30); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_EQ(is_kf_placement_violated_, false) << "Frame #" << frame_num_ << " isn't a keyframe!"; @@ -207,9 +233,9 @@ TEST_P(ForcedKeyTestLarge, ForcedFrameIsKey) { forced_kf_frame_num_ = lag_values[i] - 1; cfg_.g_lag_in_frames = lag_values[i]; is_kf_placement_violated_ = false; - libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - timebase.den, timebase.num, 0, - fwd_kf_enabled_ ? 60 : 30); + libaom_test::I420VideoSource video( + TestFileName(), TestFileWidth(), TestFileHeight(), timebase.den, + timebase.num, 0, fwd_kf_enabled_ ? 60 : 30); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_EQ(is_kf_placement_violated_, false) << "Frame #" << frame_num_ << " isn't a keyframe!"; @@ -237,9 +263,9 @@ TEST_P(ForcedKeyTestLarge, ForcedFrameIsKeyCornerCases) { forced_kf_frame_num_ = (int)cfg_.kf_max_dist + kf_offsets[i]; forced_kf_frame_num_ = forced_kf_frame_num_ > 0 ? forced_kf_frame_num_ : 1; is_kf_placement_violated_ = false; - libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - timebase.den, timebase.num, 0, - fwd_kf_enabled_ ? 60 : 30); + libaom_test::I420VideoSource video( + TestFileName(), TestFileWidth(), TestFileHeight(), timebase.den, + timebase.num, 0, fwd_kf_enabled_ ? 
60 : 30); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_EQ(is_kf_placement_violated_, false) << "Frame #" << frame_num_ << " isn't a keyframe!"; diff --git a/third_party/libaom/source/libaom/test/lossless_test.cc b/third_party/libaom/source/libaom/test/lossless_test.cc index 92ab299ea9..c14bc06e5e 100644 --- a/third_party/libaom/source/libaom/test/lossless_test.cc +++ b/third_party/libaom/source/libaom/test/lossless_test.cc @@ -24,13 +24,14 @@ namespace { const int kMaxPsnr = 100; class LosslessTestLarge - : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, - aom_rc_mode>, + : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, + aom_rc_mode, int>, public ::libaom_test::EncoderTest { protected: LosslessTestLarge() : EncoderTest(GET_PARAM(0)), psnr_(kMaxPsnr), nframes_(0), - encoding_mode_(GET_PARAM(1)), rc_end_usage_(GET_PARAM(2)) {} + encoding_mode_(GET_PARAM(1)), rc_end_usage_(GET_PARAM(2)), + cpu_used_(GET_PARAM(3)) {} virtual ~LosslessTestLarge() {} @@ -47,6 +48,7 @@ class LosslessTestLarge if (cfg_.rc_max_quantizer > 0 || cfg_.rc_min_quantizer > 0) { encoder->Control(AV1E_SET_LOSSLESS, 1); } + encoder->Control(AOME_SET_CPUUSED, cpu_used_); } } @@ -79,6 +81,7 @@ class LosslessTestLarge unsigned int nframes_; libaom_test::TestMode encoding_mode_; aom_rc_mode rc_end_usage_; + int cpu_used_; int base_qindex_; }; @@ -136,8 +139,33 @@ TEST_P(LosslessTestLarge, TestLossLessEncodingCtrl) { EXPECT_GE(psnr_lossless, kMaxPsnr); } +class LosslessAllIntraTestLarge : public LosslessTestLarge {}; + +TEST_P(LosslessAllIntraTestLarge, TestLossLessEncodingCtrl) { + const aom_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + // Intentionally set Q > 0, to make sure control can be used to activate + // lossless + cfg_.rc_min_quantizer = 10; + cfg_.rc_max_quantizer = 20; + + init_flags_ = AOM_CODEC_USE_PSNR; + + libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + timebase.den, timebase.num, 0, 
5); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double psnr_lossless = GetMinPsnr(); + EXPECT_GE(psnr_lossless, kMaxPsnr); +} + AV1_INSTANTIATE_TEST_SUITE(LosslessTestLarge, ::testing::Values(::libaom_test::kOnePassGood, ::libaom_test::kTwoPassGood), - ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ)); + ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ), + ::testing::Values(0)); // cpu_used + +AV1_INSTANTIATE_TEST_SUITE(LosslessAllIntraTestLarge, + ::testing::Values(::libaom_test::kAllIntra), + ::testing::Values(AOM_Q), + ::testing::Values(6, 9)); // cpu_used } // namespace diff --git a/third_party/libaom/source/libaom/test/metadata_test.cc b/third_party/libaom/source/libaom/test/metadata_test.cc index fd3d5c4932..b7b7f14f42 100644 --- a/third_party/libaom/source/libaom/test/metadata_test.cc +++ b/third_party/libaom/source/libaom/test/metadata_test.cc @@ -34,7 +34,7 @@ const size_t kMetadataPayloadSizeCll = 4; const uint8_t kMetadataPayloadCll[kMetadataPayloadSizeCll] = { 0xB5, 0x01, 0x02, 0x03 }; -#if CONFIG_AV1_ENCODER +#if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY const size_t kMetadataObuSizeT35 = 28; const uint8_t kMetadataObuT35[kMetadataObuSizeT35] = { @@ -193,7 +193,7 @@ TEST_P(MetadataEncodeTest, TestMetadataEncoding) { AV1_INSTANTIATE_TEST_SUITE(MetadataEncodeTest, ::testing::Values(::libaom_test::kOnePassGood)); -#endif // CONFIG_AV1_ENCODER +#endif // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY } // namespace TEST(MetadataTest, MetadataAllocation) { diff --git a/third_party/libaom/source/libaom/test/monochrome_test.cc b/third_party/libaom/source/libaom/test/monochrome_test.cc index 6395c22caf..a71cc9b3df 100644 --- a/third_party/libaom/source/libaom/test/monochrome_test.cc +++ b/third_party/libaom/source/libaom/test/monochrome_test.cc @@ -20,16 +20,45 @@ namespace { +const unsigned int kCqLevel = 18; +const double kMaxPsnr = 100.0; + +// kPsnrThreshold represents the psnr threshold used to validate the quality of +// the first frame. 
The indices, 0 and 1 correspond to non-allintra and allintra +// encoding modes. +const double kPsnrThreshold[2] = { 29.0, 41.5 }; + +// kPsnrFluctuation represents the maximum allowed psnr fluctuation w.r.t first +// frame. The indices, 0 and 1 correspond to non-allintra and allintra encoding +// modes. +const double kPsnrFluctuation[2] = { 2.5, 0.3 }; + class MonochromeTest - : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>, + : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int, + int>, public ::libaom_test::EncoderTest { protected: - MonochromeTest() : EncoderTest(GET_PARAM(0)), frame0_psnr_y_(0.) {} + MonochromeTest() + : EncoderTest(GET_PARAM(0)), lossless_(GET_PARAM(2)), + frame0_psnr_y_(0.0) {} virtual ~MonochromeTest() {} virtual void SetUp() { InitializeConfig(GET_PARAM(1)); } + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AOME_SET_CPUUSED, GET_PARAM(3)); + if (mode_ == ::libaom_test::kAllIntra) { + encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel); + } + if (lossless_) { + encoder->Control(AV1E_SET_LOSSLESS, 1); + } + } + } + virtual void DecompressedFrameHook(const aom_image_t &img, aom_codec_pts_t pts) { (void)pts; @@ -68,15 +97,23 @@ class MonochromeTest } virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + // Check average PSNR value is >= 100 db in case of lossless encoding. + if (lossless_) { + EXPECT_GE(pkt->data.psnr.psnr[0], kMaxPsnr); + return; + } + const bool is_allintra = (mode_ == ::libaom_test::kAllIntra); // Check that the initial Y PSNR value is 'high enough', and check that // subsequent Y PSNR values are 'close' to this initial value. - if (frame0_psnr_y_ == 0.) 
{ + if (frame0_psnr_y_ == 0.0) { frame0_psnr_y_ = pkt->data.psnr.psnr[1]; - EXPECT_GT(frame0_psnr_y_, 29.); + EXPECT_GT(frame0_psnr_y_, kPsnrThreshold[is_allintra]); } - EXPECT_NEAR(pkt->data.psnr.psnr[1], frame0_psnr_y_, 2.5); + EXPECT_NEAR(pkt->data.psnr.psnr[1], frame0_psnr_y_, + kPsnrFluctuation[is_allintra]); } + int lossless_; std::vector<int> chroma_value_list_; double frame0_psnr_y_; }; @@ -87,9 +124,6 @@ TEST_P(MonochromeTest, TestMonochromeEncoding) { init_flags_ = AOM_CODEC_USE_PSNR; - cfg_.g_w = 352; - cfg_.g_h = 288; - cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 600; cfg_.rc_buf_sz = 1000; @@ -98,13 +132,10 @@ TEST_P(MonochromeTest, TestMonochromeEncoding) { cfg_.rc_undershoot_pct = 50; cfg_.rc_overshoot_pct = 50; cfg_.rc_end_usage = AOM_CBR; - cfg_.kf_mode = AOM_KF_AUTO; cfg_.g_lag_in_frames = 1; cfg_.kf_min_dist = cfg_.kf_max_dist = 3000; // Enable dropped frames. cfg_.rc_dropframe_thresh = 1; - // Disable error_resilience mode. - cfg_.g_error_resilient = 0; // Run at low bitrate. cfg_.rc_target_bitrate = 40; // Set monochrome encoding flag @@ -121,8 +152,33 @@ TEST_P(MonochromeTest, TestMonochromeEncoding) { } } +class MonochromeAllIntraTest : public MonochromeTest {}; + +TEST_P(MonochromeAllIntraTest, TestMonochromeEncoding) { + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 5); + init_flags_ = AOM_CODEC_USE_PSNR; + // Set monochrome encoding flag + cfg_.monochrome = 1; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // Check that the chroma planes are equal across all frames + std::vector<int>::const_iterator iter = chroma_value_list_.begin(); + int initial_chroma_value = *iter; + for (; iter != chroma_value_list_.end(); ++iter) { + // Check that all decoded frames have the same constant chroma planes. 
+ EXPECT_EQ(*iter, initial_chroma_value); + } +} + AV1_INSTANTIATE_TEST_SUITE(MonochromeTest, ::testing::Values(::libaom_test::kOnePassGood, - ::libaom_test::kTwoPassGood)); - + ::libaom_test::kTwoPassGood), + ::testing::Values(0), // lossless + ::testing::Values(0)); // cpu_used + +AV1_INSTANTIATE_TEST_SUITE(MonochromeAllIntraTest, + ::testing::Values(::libaom_test::kAllIntra), + ::testing::Values(0, 1), // lossless + ::testing::Values(6, 9)); // cpu_used } // namespace diff --git a/third_party/libaom/source/libaom/test/noise_model_test.cc b/third_party/libaom/source/libaom/test/noise_model_test.cc index aad8905a45..c12c080cac 100644 --- a/third_party/libaom/source/libaom/test/noise_model_test.cc +++ b/third_party/libaom/source/libaom/test/noise_model_test.cc @@ -212,6 +212,12 @@ TEST(NoiseStrengthSolver, SimplifiesCurve) { aom_noise_strength_solver_free(&solver); } +TEST(NoiseStrengthLut, LutInitNegativeOrZeroSize) { + aom_noise_strength_lut_t lut; + ASSERT_FALSE(aom_noise_strength_lut_init(&lut, -1)); + ASSERT_FALSE(aom_noise_strength_lut_init(&lut, 0)); +} + TEST(NoiseStrengthLut, LutEvalSinglePoint) { aom_noise_strength_lut_t lut; ASSERT_TRUE(aom_noise_strength_lut_init(&lut, 1)); diff --git a/third_party/libaom/source/libaom/test/quant_test.cc b/third_party/libaom/source/libaom/test/quant_test.cc index 9fca953922..a042af13eb 100644 --- a/third_party/libaom/source/libaom/test/quant_test.cc +++ b/third_party/libaom/source/libaom/test/quant_test.cc @@ -20,6 +20,13 @@ namespace { +const ::libaom_test::TestMode kTestMode[] = +#if CONFIG_REALTIME_ONLY + { ::libaom_test::kRealTime }; +#else + { ::libaom_test::kRealTime, ::libaom_test::kOnePassGood }; +#endif + class QMTest : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>, public ::libaom_test::EncoderTest { @@ -41,6 +48,11 @@ class QMTest encoder->Control(AV1E_SET_QM_MAX, qm_max_); encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT, 100); + if (mode_ == ::libaom_test::kRealTime) { + 
encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0); + encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0); + encoder->Control(AV1E_SET_ENABLE_OBMC, 0); + } } } @@ -75,11 +87,10 @@ TEST_P(QMTest, TestNoMisMatchQM2) { DoTest(0, 8); } // encodes and decodes without a mismatch. TEST_P(QMTest, TestNoMisMatchQM3) { DoTest(9, 15); } -AV1_INSTANTIATE_TEST_SUITE(QMTest, - ::testing::Values(::libaom_test::kRealTime, - ::libaom_test::kOnePassGood), +AV1_INSTANTIATE_TEST_SUITE(QMTest, ::testing::ValuesIn(kTestMode), ::testing::Range(5, 9)); +#if !CONFIG_REALTIME_ONLY typedef struct { const unsigned int min_q; const unsigned int max_q; @@ -173,4 +184,5 @@ AV1_INSTANTIATE_TEST_SUITE(QuantizerBoundsCheckTestLarge, ::libaom_test::kTwoPassGood), ::testing::ValuesIn(QuantTestParams), ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ)); +#endif // !CONFIG_REALTIME_ONLY } // namespace diff --git a/third_party/libaom/source/libaom/test/quantize_func_test.cc b/third_party/libaom/source/libaom/test/quantize_func_test.cc index 3d79cf8bd8..3523050844 100644 --- a/third_party/libaom/source/libaom/test/quantize_func_test.cc +++ b/third_party/libaom/source/libaom/test/quantize_func_test.cc @@ -589,4 +589,5 @@ INSTANTIATE_TEST_SUITE_P( static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8))); #endif // HAVE_AVX + } // namespace diff --git a/third_party/libaom/source/libaom/test/rd_test.cc b/third_party/libaom/source/libaom/test/rd_test.cc new file mode 100644 index 0000000000..0c481fcbb6 --- /dev/null +++ b/third_party/libaom/source/libaom/test/rd_test.cc @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <math.h> +#include <vector> + +#include "av1/common/quant_common.h" +#include "av1/encoder/rd.h" +#include "aom/aom_codec.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { + +TEST(RdTest, GetDeltaqOffsetValueTest1) { + aom_bit_depth_t bit_depth = AOM_BITS_8; + double beta = 4; + int q_index = 29; + int dc_q_step = + av1_dc_quant_QTX(q_index, 0, static_cast<aom_bit_depth_t>(bit_depth)); + EXPECT_EQ(dc_q_step, 32); + + int ref_new_dc_q_step = static_cast<int>(round(dc_q_step / sqrt(beta))); + EXPECT_EQ(ref_new_dc_q_step, 16); + + int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta); + int new_dc_q_step = av1_dc_quant_QTX(q_index, delta_q, + static_cast<aom_bit_depth_t>(bit_depth)); + + EXPECT_EQ(new_dc_q_step, ref_new_dc_q_step); +} + +TEST(RdTest, GetDeltaqOffsetValueTest2) { + aom_bit_depth_t bit_depth = AOM_BITS_8; + double beta = 1.0 / 4.0; + int q_index = 29; + int dc_q_step = + av1_dc_quant_QTX(q_index, 0, static_cast<aom_bit_depth_t>(bit_depth)); + EXPECT_EQ(dc_q_step, 32); + + int ref_new_dc_q_step = static_cast<int>(round(dc_q_step / sqrt(beta))); + EXPECT_EQ(ref_new_dc_q_step, 64); + + int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta); + int new_dc_q_step = av1_dc_quant_QTX(q_index, delta_q, + static_cast<aom_bit_depth_t>(bit_depth)); + + EXPECT_EQ(new_dc_q_step, ref_new_dc_q_step); +} + +TEST(RdTest, GetDeltaqOffsetBoundaryTest1) { + aom_bit_depth_t bit_depth = AOM_BITS_8; + double beta = 0.000000001; + std::vector<int> q_index_ls = { 254, 255 }; + for (auto q_index : q_index_ls) { + int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta); + EXPECT_EQ(q_index + delta_q, 255); + } +} + +TEST(RdTest, GetDeltaqOffsetBoundaryTest2) { + aom_bit_depth_t bit_depth = AOM_BITS_8; + double beta = 100; + 
std::vector<int> q_index_ls = { 1, 0 }; + for (auto q_index : q_index_ls) { + int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta); + EXPECT_EQ(q_index + delta_q, 0); + } +} + +TEST(RdTest, GetDeltaqOffsetUnitaryTest1) { + aom_bit_depth_t bit_depth = AOM_BITS_8; + double beta = 1; + for (int q_index = 0; q_index < 255; ++q_index) { + int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta); + EXPECT_EQ(delta_q, 0); + } +} + +} // namespace diff --git a/third_party/libaom/source/libaom/test/resize_test.cc b/third_party/libaom/source/libaom/test/resize_test.cc index cb09a9a193..68d610151d 100644 --- a/third_party/libaom/source/libaom/test/resize_test.cc +++ b/third_party/libaom/source/libaom/test/resize_test.cc @@ -203,6 +203,17 @@ class ResizeTest virtual void SetUp() { InitializeConfig(GET_PARAM(1)); } + virtual void PreEncodeFrameHook(libaom_test::VideoSource *video, + libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + if (GET_PARAM(1) == ::libaom_test::kRealTime) { + encoder->Control(AV1E_SET_AQ_MODE, 3); + encoder->Control(AOME_SET_CPUUSED, 5); + encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); + } + } + } + virtual void DecompressedFrameHook(const aom_image_t &img, aom_codec_pts_t pts) { frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h)); @@ -241,6 +252,7 @@ TEST_P(ResizeTest, TestExternalResizeWorks) { const unsigned int kStepDownFrame = 3; const unsigned int kStepUpFrame = 6; +#if !CONFIG_REALTIME_ONLY class ResizeInternalTestLarge : public ResizeTest { protected: #if WRITE_COMPRESSED_STREAM @@ -362,6 +374,10 @@ TEST_P(ResizeInternalTestLarge, TestInternalResizeChangeConfig) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } +AV1_INSTANTIATE_TEST_SUITE(ResizeInternalTestLarge, + ::testing::Values(::libaom_test::kOnePassGood)); +#endif + class ResizeRealtimeTest : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>, public ::libaom_test::EncoderTest { @@ -375,6 +391,9 @@ class ResizeRealtimeTest 
libaom_test::Encoder *encoder) { if (video->frame() == 0) { encoder->Control(AV1E_SET_AQ_MODE, 3); + encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0); + encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0); + encoder->Control(AV1E_SET_ENABLE_OBMC, 0); encoder->Control(AOME_SET_CPUUSED, set_cpu_used_); encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); } @@ -786,6 +805,7 @@ TEST_P(ResizeCspTest, TestResizeCspWorks) { } } +#if !CONFIG_REALTIME_ONLY // This class is used to check if there are any fatal // failures while encoding with resize-mode > 0 class ResizeModeTestLarge @@ -833,16 +853,6 @@ TEST_P(ResizeModeTestLarge, ResizeModeTest) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } -AV1_INSTANTIATE_TEST_SUITE(ResizeTest, - ::testing::Values(::libaom_test::kRealTime)); -AV1_INSTANTIATE_TEST_SUITE(ResizeInternalTestLarge, - ::testing::Values(::libaom_test::kOnePassGood)); -AV1_INSTANTIATE_TEST_SUITE(ResizeRealtimeTest, - ::testing::Values(::libaom_test::kRealTime), - ::testing::Range(5, 10)); -AV1_INSTANTIATE_TEST_SUITE(ResizeCspTest, - ::testing::Values(::libaom_test::kRealTime)); - // TODO(anyone): Enable below test once resize issues are fixed GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ResizeModeTestLarge); // AV1_INSTANTIATE_TEST_SUITE( @@ -851,4 +861,14 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ResizeModeTestLarge); // ::libaom_test::kTwoPassGood), // ::testing::Values(1, 2), ::testing::Values(8, 12, 16), // ::testing::Values(8, 12, 16), ::testing::Range(2, 7)); +#endif // !CONFIG_REALTIME_ONLY + +AV1_INSTANTIATE_TEST_SUITE(ResizeTest, + ::testing::Values(::libaom_test::kRealTime)); +AV1_INSTANTIATE_TEST_SUITE(ResizeRealtimeTest, + ::testing::Values(::libaom_test::kRealTime), + ::testing::Range(6, 10)); +AV1_INSTANTIATE_TEST_SUITE(ResizeCspTest, + ::testing::Values(::libaom_test::kRealTime)); + } // namespace diff --git a/third_party/libaom/source/libaom/test/rt_end_to_end_test.cc b/third_party/libaom/source/libaom/test/rt_end_to_end_test.cc index 
e8a1a40d87..6d3704dbfc 100644 --- a/third_party/libaom/source/libaom/test/rt_end_to_end_test.cc +++ b/third_party/libaom/source/libaom/test/rt_end_to_end_test.cc @@ -42,9 +42,9 @@ std::unordered_map<std::string, { 6, { { 0, 36.1 }, { 3, 36.5 } } }, { 7, { { 0, 35.5 }, { 3, 36.0 } } }, { 8, { { 0, 36.0 }, { 3, 36.5 } } }, - { 9, { { 0, 35.5 }, { 3, 36.1 } } } } }, + { 9, { { 0, 35.5 }, { 3, 36.0 } } } } }, { "niklas_1280_720_30.y4m", - { { 5, { { 0, 34.4 }, { 3, 34.4 } } }, + { { 5, { { 0, 34.4 }, { 3, 34.32 } } }, { 6, { { 0, 34.2 }, { 3, 34.2 } } }, { 7, { { 0, 33.6 }, { 3, 33.6 } } }, { 8, { { 0, 33.48 }, { 3, 33.48 } } }, @@ -125,6 +125,7 @@ class RTEndToEndTest encoder->Control(AV1E_SET_COEFF_COST_UPD_FREQ, 2); encoder->Control(AV1E_SET_MODE_COST_UPD_FREQ, 2); encoder->Control(AV1E_SET_MV_COST_UPD_FREQ, 2); + encoder->Control(AV1E_SET_DV_COST_UPD_FREQ, 2); } } diff --git a/third_party/libaom/source/libaom/test/sad_test.cc b/third_party/libaom/source/libaom/test/sad_test.cc index afd84a8ad2..037ed2455f 100644 --- a/third_party/libaom/source/libaom/test/sad_test.cc +++ b/third_party/libaom/source/libaom/test/sad_test.cc @@ -564,8 +564,8 @@ class DistWtdCompAvgTest void CheckCompAvg() { for (int j = 0; j < 2; ++j) { for (int i = 0; i < 4; ++i) { - jcp_param_.fwd_offset = quant_dist_lookup_table[j][i][0]; - jcp_param_.bck_offset = quant_dist_lookup_table[j][i][1]; + jcp_param_.fwd_offset = quant_dist_lookup_table[i][j]; + jcp_param_.bck_offset = quant_dist_lookup_table[i][1 - j]; ReferenceDistWtdCompAvg(0); dist_wtd_comp_avg(0); @@ -632,8 +632,8 @@ class DistWtdSADavgTest void CheckSAD() { for (int j = 0; j < 2; ++j) { for (int i = 0; i < 4; ++i) { - jcp_param_.fwd_offset = quant_dist_lookup_table[j][i][0]; - jcp_param_.bck_offset = quant_dist_lookup_table[j][i][1]; + jcp_param_.fwd_offset = quant_dist_lookup_table[i][j]; + jcp_param_.bck_offset = quant_dist_lookup_table[i][1 - j]; const unsigned int reference_sad = ReferenceDistWtdSADavg(0); const unsigned int 
exp_sad = dist_wtd_SAD_avg(0); @@ -705,9 +705,7 @@ TEST_P(SADTest, ShortSrc) { source_stride_ = tmp_stride; } -#define SPEED_TEST (0) -#if SPEED_TEST -TEST_P(SADTest, Speed) { +TEST_P(SADTest, DISABLED_Speed) { const int tmp_stride = source_stride_; source_stride_ >>= 1; FillRandom(source_data_, source_stride_); @@ -715,7 +713,6 @@ TEST_P(SADTest, Speed) { SpeedSAD(); source_stride_ = tmp_stride; } -#endif TEST_P(SADSkipTest, MaxRef) { FillConstant(source_data_, source_stride_, 0); @@ -762,8 +759,7 @@ TEST_P(SADSkipTest, ShortSrc) { source_stride_ = tmp_stride; } -#if SPEED_TEST -TEST_P(SADSkipTest, Speed) { +TEST_P(SADSkipTest, DISABLED_Speed) { const int tmp_stride = source_stride_; source_stride_ >>= 1; FillRandom(source_data_, source_stride_); @@ -771,7 +767,6 @@ TEST_P(SADSkipTest, Speed) { SpeedSAD(); source_stride_ = tmp_stride; } -#endif TEST_P(SADavgTest, MaxRef) { FillConstant(source_data_, source_stride_, 0); @@ -1020,8 +1015,7 @@ TEST_P(SADx4Test, SrcAlignedByWidth) { source_data_ = tmp_source_data; } -#if SPEED_TEST -TEST_P(SADx4Test, Speed) { +TEST_P(SADx4Test, DISABLED_Speed) { FillRandom(source_data_, source_stride_); FillRandom(GetReference(0), reference_stride_); FillRandom(GetReference(1), reference_stride_); @@ -1029,7 +1023,6 @@ TEST_P(SADx4Test, Speed) { FillRandom(GetReference(3), reference_stride_); SpeedSAD(); } -#endif // SADSkipx4 TEST_P(SADSkipx4Test, MaxRef) { @@ -1104,8 +1097,7 @@ TEST_P(SADSkipx4Test, SrcAlignedByWidth) { source_data_ = tmp_source_data; } -#if SPEED_TEST -TEST_P(SADSkipx4Test, Speed) { +TEST_P(SADSkipx4Test, DISABLED_Speed) { FillRandom(source_data_, source_stride_); FillRandom(GetReference(0), reference_stride_); FillRandom(GetReference(1), reference_stride_); @@ -1113,12 +1105,10 @@ TEST_P(SADSkipx4Test, Speed) { FillRandom(GetReference(3), reference_stride_); SpeedSAD(); } -#endif using std::make_tuple; -#if SPEED_TEST -TEST_P(SADx4AvgTest, Speed) { +TEST_P(SADx4AvgTest, DISABLED_Speed) { int tmp_stride = 
reference_stride_; reference_stride_ >>= 1; FillRandom(source_data_, source_stride_); @@ -1130,7 +1120,6 @@ TEST_P(SADx4AvgTest, Speed) { SpeedSAD(); reference_stride_ = tmp_stride; } -#endif TEST_P(SADx4AvgTest, MaxRef) { FillConstant(source_data_, source_stride_, 0); diff --git a/third_party/libaom/source/libaom/test/sharpness_test.cc b/third_party/libaom/source/libaom/test/sharpness_test.cc new file mode 100644 index 0000000000..e74609bd9d --- /dev/null +++ b/third_party/libaom/source/libaom/test/sharpness_test.cc @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <unordered_map> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/y4m_video_source.h" + +namespace { +const unsigned int kFrames = 10; +const int kBitrate = 500; +const unsigned int kCqLevel = 18; + +// List of psnr thresholds for different test combinations +// keys: test-mode, cpu-used, sharpness. 
+const std::unordered_map< + int, std::unordered_map<int, std::unordered_map<int, double>>> + kPsnrThreshold = { { static_cast<int>(::libaom_test::kTwoPassGood), + { { 2, { { 2, 37.6 }, { 5, 37.6 } } }, + { 4, { { 2, 37.5 }, { 5, 37.5 } } }, + { 6, { { 2, 37.5 }, { 5, 37.5 } } } } }, + { static_cast<int>(::libaom_test::kAllIntra), + { { 3, { { 2, 42.3 }, { 5, 42.4 } } }, + { 6, { { 2, 41.8 }, { 4, 41.9 }, { 5, 41.9 } } }, + { 9, { { 2, 41.4 }, { 5, 41.4 } } } } } }; + +// This class is used to test sharpness parameter configured through control +// call using AOME_SET_SHARPNESS for different encoder configurations. +class SharpnessTest + : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int, + int>, + public ::libaom_test::EncoderTest { + protected: + SharpnessTest() + : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), + cpu_used_(GET_PARAM(2)), sharpness_level_(GET_PARAM(3)), psnr_(0.0), + nframes_(0) {} + + ~SharpnessTest() override {} + + void SetUp() override { + InitializeConfig(encoding_mode_); + if (encoding_mode_ == ::libaom_test::kTwoPassGood) { + cfg_.rc_target_bitrate = kBitrate; + cfg_.g_lag_in_frames = 5; + } + } + + void BeginPassHook(unsigned int) override { + psnr_ = 0.0; + nframes_ = 0; + } + + void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(AOME_SET_CPUUSED, cpu_used_); + encoder->Control(AOME_SET_SHARPNESS, sharpness_level_); + if (encoding_mode_ == ::libaom_test::kTwoPassGood) { + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1); + encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7); + encoder->Control(AOME_SET_ARNR_STRENGTH, 5); + } else if (encoding_mode_ == ::libaom_test::kAllIntra) { + encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel); + } + } + } + + double GetAveragePsnr() const { + if (nframes_) return psnr_ 
/ nframes_; + return 0.0; + } + + double GetPsnrThreshold() { + return kPsnrThreshold.at(encoding_mode_).at(cpu_used_).at(sharpness_level_); + } + + void DoTest() { + init_flags_ = AOM_CODEC_USE_PSNR; + + std::unique_ptr<libaom_test::VideoSource> video( + new libaom_test::Y4mVideoSource("paris_352_288_30.y4m", 0, kFrames)); + ASSERT_TRUE(video.get() != NULL); + + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + const double psnr = GetAveragePsnr(); + EXPECT_GT(psnr, GetPsnrThreshold()) + << "encoding mode = " << encoding_mode_ << ", cpu used = " << cpu_used_ + << ", sharpness level = " << sharpness_level_; + } + + private: + const libaom_test::TestMode encoding_mode_; + const int cpu_used_; + const int sharpness_level_; + double psnr_; + unsigned int nframes_; +}; + +class SharpnessTestLarge : public SharpnessTest {}; + +class SharpnessAllIntraTest : public SharpnessTest {}; + +class SharpnessAllIntraTestLarge : public SharpnessTest {}; + +TEST_P(SharpnessTestLarge, SharpnessPSNRTest) { DoTest(); } + +TEST_P(SharpnessAllIntraTest, SharpnessPSNRTest) { DoTest(); } + +TEST_P(SharpnessAllIntraTestLarge, SharpnessPSNRTest) { DoTest(); } + +AV1_INSTANTIATE_TEST_SUITE(SharpnessTestLarge, + ::testing::Values(::libaom_test::kTwoPassGood), + ::testing::Values(2, 4, 6), // cpu_used + ::testing::Values(2, 5)); // sharpness level + +AV1_INSTANTIATE_TEST_SUITE(SharpnessAllIntraTest, + ::testing::Values(::libaom_test::kAllIntra), + ::testing::Values(6), // cpu_used + ::testing::Values(4)); // sharpness level + +AV1_INSTANTIATE_TEST_SUITE(SharpnessAllIntraTestLarge, + ::testing::Values(::libaom_test::kAllIntra), + ::testing::Values(3, 6, 9), // cpu_used + ::testing::Values(2, 5)); // sharpness level +} // namespace diff --git a/third_party/libaom/source/libaom/test/svc_datarate_test.cc b/third_party/libaom/source/libaom/test/svc_datarate_test.cc index 8d7376a554..d2839ccc61 100644 --- a/third_party/libaom/source/libaom/test/svc_datarate_test.cc +++ 
b/third_party/libaom/source/libaom/test/svc_datarate_test.cc @@ -80,6 +80,7 @@ class DatarateTestSVC mismatch_psnr_ = 0.0; set_frame_level_er_ = 0; multi_ref_ = 0; + use_fixed_mode_svc_ = 0; } virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, @@ -89,6 +90,7 @@ class DatarateTestSVC initialize_svc(number_temporal_layers_, number_spatial_layers_, &svc_params_); encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_); + // TODO(aomedia:3032): Configure KSVC in fixed mode. encoder->Control(AV1E_SET_ENABLE_ORDER_HINT, 0); encoder->Control(AV1E_SET_ENABLE_TPL_MODEL, 0); encoder->Control(AV1E_SET_DELTAQ_MODE, 0); @@ -110,7 +112,11 @@ class DatarateTestSVC set_layer_pattern(video->frame(), &layer_id_, &ref_frame_config_, spatial_layer_id, multi_ref_); encoder->Control(AV1E_SET_SVC_LAYER_ID, &layer_id_); - encoder->Control(AV1E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_); + // The SET_SVC_REF_FRAME_CONFIG api is for the flexible SVC mode + // (i.e., use_fixed_mode_svc == 0). + if (!use_fixed_mode_svc_) { + encoder->Control(AV1E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_); + } if (set_frame_level_er_) { int mode = (layer_id_.spatial_layer_id > 0 || layer_id_.temporal_layer_id > 0); @@ -170,7 +176,7 @@ class DatarateTestSVC int lag_index = 0; int base_count = frame_cnt >> 2; layer_id->spatial_layer_id = spatial_layer; - // Set the referende map buffer idx for the 7 references: + // Set the reference map buffer idx for the 7 references: // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3), // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6). 
for (int i = 0; i < INTER_REFS_PER_FRAME; i++) { @@ -689,6 +695,48 @@ class DatarateTestSVC } } + virtual void BasicRateTargetingFixedModeSVC3TL3SLHDTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 0; + + ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + const int bitrate_array[2] = { 600, 1200 }; + cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; + ResetModel(); + number_temporal_layers_ = 3; + number_spatial_layers_ = 3; + use_fixed_mode_svc_ = 1; + // SL0 + const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100; + target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100; + target_layer_bitrate_[2] = bitrate_sl0; + // SL1 + const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100; + target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100; + target_layer_bitrate_[5] = bitrate_sl1; + // SL2 + const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100; + target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100; + target_layer_bitrate_[8] = bitrate_sl2; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) { + ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.45) + << " The datarate for the file is greater than target by too much!"; + } + } + virtual void BasicRateTargetingSVC3TL3SLHDMT2Test() { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; @@ -1101,6 +1149,7 @@ class DatarateTestSVC double mismatch_psnr_; int set_frame_level_er_; int multi_ref_; + int 
use_fixed_mode_svc_; }; // Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial. @@ -1142,6 +1191,12 @@ TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLHD) { } // Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers, +// for fixed mode SVC. +TEST_P(DatarateTestSVC, BasicRateTargetingFixedModeSVC3TL3SLHD) { + BasicRateTargetingFixedModeSVC3TL3SLHDTest(); +} + +// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers, // for 2 threads, 2 tile_columns, row-mt enabled. TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLHDMT2) { BasicRateTargetingSVC3TL3SLHDMT2Test(); diff --git a/third_party/libaom/source/libaom/test/tile_config_test.cc b/third_party/libaom/source/libaom/test/tile_config_test.cc index 0098903aa8..517d54bd94 100644 --- a/third_party/libaom/source/libaom/test/tile_config_test.cc +++ b/third_party/libaom/source/libaom/test/tile_config_test.cc @@ -28,6 +28,14 @@ typedef struct { const unsigned int tile_cols; } uniformTileConfigParam; +const libaom_test::TestMode kTestModeParams[] = +#if CONFIG_REALTIME_ONLY + { ::libaom_test::kRealTime }; +#else + { ::libaom_test::kRealTime, ::libaom_test::kOnePassGood, + ::libaom_test::kTwoPassGood }; +#endif + static const uniformTileConfigParam uniformTileConfigParams[] = { { 128, 0, 0 }, { 128, 0, 2 }, { 128, 2, 0 }, { 128, 1, 2 }, { 128, 2, 2 }, { 128, 3, 2 }, { 64, 0, 0 }, { 64, 0, 2 }, { 64, 2, 0 }, { 64, 1, 2 }, @@ -254,14 +262,12 @@ TEST_P(NonUniformTileConfigTestLarge, NonUniformTileConfigTest) { } AV1_INSTANTIATE_TEST_SUITE(UniformTileConfigTestLarge, - ::testing::Values(::libaom_test::kOnePassGood, - ::libaom_test::kTwoPassGood), + ::testing::ValuesIn(kTestModeParams), ::testing::ValuesIn(uniformTileConfigParams), ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ)); AV1_INSTANTIATE_TEST_SUITE(NonUniformTileConfigTestLarge, - ::testing::Values(::libaom_test::kOnePassGood, - ::libaom_test::kTwoPassGood), + ::testing::ValuesIn(kTestModeParams), 
::testing::ValuesIn(nonUniformTileConfigParams), ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ)); @@ -352,7 +358,6 @@ TEST_P(TileGroupTestLarge, TileGroupCountTest) { } AV1_INSTANTIATE_TEST_SUITE(TileGroupTestLarge, - ::testing::Values(::libaom_test::kOnePassGood, - ::libaom_test::kTwoPassGood), + ::testing::ValuesIn(kTestModeParams), ::testing::ValuesIn(tileGroupTestParams)); } // namespace diff --git a/third_party/libaom/source/libaom/test/time_stamp_test.cc b/third_party/libaom/source/libaom/test/time_stamp_test.cc index 205e5ba5bd..baa0dc06db 100644 --- a/third_party/libaom/source/libaom/test/time_stamp_test.cc +++ b/third_party/libaom/source/libaom/test/time_stamp_test.cc @@ -95,8 +95,13 @@ TEST_P(TimestampTest, TestAv1Rollover) { video.set_starting_pts(922337170351ll); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } - +#if CONFIG_REALTIME_ONLY +AV1_INSTANTIATE_TEST_SUITE(TimestampTest, + ::testing::Values(::libaom_test::kRealTime)); +#else AV1_INSTANTIATE_TEST_SUITE(TimestampTest, - ::testing::Values(::libaom_test::kTwoPassGood)); + ::testing::Values(::libaom_test::kRealTime, + ::libaom_test::kTwoPassGood)); +#endif } // namespace diff --git a/third_party/libaom/source/libaom/test/tpl_model_test.cc b/third_party/libaom/source/libaom/test/tpl_model_test.cc new file mode 100644 index 0000000000..83845ee6d7 --- /dev/null +++ b/third_party/libaom/source/libaom/test/tpl_model_test.cc @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <cstdlib> +#include <vector> + +#include "av1/encoder/cost.h" +#include "av1/encoder/tpl_model.h" +#include "av1/encoder/encoder.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { + +double laplace_prob(double q_step, double b, double zero_bin_ratio, + int qcoeff) { + int abs_qcoeff = abs(qcoeff); + double z0 = fmax(exp(-zero_bin_ratio / 2 * q_step / b), TPL_EPSILON); + if (abs_qcoeff == 0) { + double p0 = 1 - z0; + return p0; + } else { + assert(abs_qcoeff > 0); + double z = fmax(exp(-q_step / b), TPL_EPSILON); + double p = z0 / 2 * (1 - z) * pow(z, abs_qcoeff - 1); + return p; + } +} +TEST(TplModelTest, ExponentialEntropyBoundaryTest1) { + double b = 0; + double q_step = 1; + double entropy = av1_exponential_entropy(q_step, b); + EXPECT_NEAR(entropy, 0, 0.00001); +} + +TEST(TplModelTest, TransformCoeffEntropyTest1) { + // Check the consistency between av1_estimate_coeff_entropy() and + // laplace_prob() + double b = 1; + double q_step = 1; + double zero_bin_ratio = 2; + for (int qcoeff = -256; qcoeff < 256; ++qcoeff) { + double rate = av1_estimate_coeff_entropy(q_step, b, zero_bin_ratio, qcoeff); + double prob = laplace_prob(q_step, b, zero_bin_ratio, qcoeff); + double ref_rate = -log2(prob); + EXPECT_DOUBLE_EQ(rate, ref_rate); + } +} + +TEST(TplModelTest, TransformCoeffEntropyTest2) { + // Check the consistency between av1_estimate_coeff_entropy(), laplace_prob() + // and av1_laplace_entropy() + double b = 1; + double q_step = 1; + double zero_bin_ratio = 2; + double est_expected_rate = 0; + for (int qcoeff = -20; qcoeff < 20; ++qcoeff) { + double rate = av1_estimate_coeff_entropy(q_step, b, zero_bin_ratio, qcoeff); + double prob = laplace_prob(q_step, b, zero_bin_ratio, qcoeff); + est_expected_rate += prob * rate; + } + double expected_rate = av1_laplace_entropy(q_step, b, zero_bin_ratio); + EXPECT_NEAR(expected_rate, est_expected_rate, 0.001); +} + +TEST(TplModelTest, DeltaRateCostZeroFlow) { + // When 
srcrf_dist equal to recrf_dist, av1_delta_rate_cost should return 0 + int64_t srcrf_dist = 256; + int64_t recrf_dist = 256; + int64_t delta_rate = 512; + int pixel_num = 256; + int64_t rate_cost = + av1_delta_rate_cost(delta_rate, recrf_dist, srcrf_dist, pixel_num); + EXPECT_EQ(rate_cost, 0); +} + +// a reference function of av1_delta_rate_cost() with delta_rate using bit as +// basic unit +double ref_delta_rate_cost(int64_t delta_rate, double src_rec_ratio, + int pixel_count) { + assert(src_rec_ratio <= 1 && src_rec_ratio >= 0); + double bits_per_pixel = (double)delta_rate / pixel_count; + double p = pow(2, bits_per_pixel); + double flow_rate_per_pixel = + sqrt(p * p / (src_rec_ratio * p * p + (1 - src_rec_ratio))); + double rate_cost = pixel_count * log2(flow_rate_per_pixel); + return rate_cost; +} + +TEST(TplModelTest, DeltaRateCostReference) { + const int64_t scale = TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT; + std::vector<int64_t> srcrf_dist_arr = { 256, 257, 312 }; + std::vector<int64_t> recrf_dist_arr = { 512, 288, 620 }; + std::vector<int64_t> delta_rate_arr = { 10, 278, 100 }; + for (size_t t = 0; t < srcrf_dist_arr.size(); ++t) { + int64_t srcrf_dist = srcrf_dist_arr[t]; + int64_t recrf_dist = recrf_dist_arr[t]; + int64_t delta_rate = delta_rate_arr[t]; + int64_t scaled_delta_rate = delta_rate << scale; + int pixel_count = 256; + int64_t rate_cost = av1_delta_rate_cost(scaled_delta_rate, recrf_dist, + srcrf_dist, pixel_count); + rate_cost >>= scale; + double src_rec_ratio = (double)srcrf_dist / recrf_dist; + double ref_rate_cost = + ref_delta_rate_cost(delta_rate, src_rec_ratio, pixel_count); + EXPECT_NEAR((double)rate_cost, ref_rate_cost, 1); + } +} + +TEST(TplModelTest, GetOverlapAreaHasOverlap) { + // The block a's area is [10, 17) x [18, 24). + // The block b's area is [8, 15) x [17, 23). + // The overlapping area between block a and block b is [10, 15) x [18, 23). + // Therefore, the size of the area is (15 - 10) * (23 - 18) = 25. 
+ int row_a = 10; + int col_a = 18; + int row_b = 8; + int col_b = 17; + int height = 7; + int width = 6; + int overlap_area = + av1_get_overlap_area(row_a, col_a, row_b, col_b, width, height); + EXPECT_EQ(overlap_area, 25); +} + +TEST(TplModelTest, GetOverlapAreaNoOverlap) { + // The block a's area is [10, 14) x [18, 22). + // The block b's area is [5, 9) x [5, 9). + // Threre is no overlapping area between block a and block b. + // Therefore, the return value should be zero. + int row_a = 10; + int col_a = 18; + int row_b = 5; + int col_b = 5; + int height = 4; + int width = 4; + int overlap_area = + av1_get_overlap_area(row_a, col_a, row_b, col_b, width, height); + EXPECT_EQ(overlap_area, 0); +} + +TEST(TPLModelTest, EstimateFrameRateTest) { + /* + * Transform size: 16x16 + * Frame count: 16 + * Transform block count: 20 + */ + const int txfm_size = 256; // 16x16 + const int frame_count = 16; + unsigned char q_index_list[16]; + TplTxfmStats stats_list[16]; + + for (int i = 0; i < frame_count; i++) { + q_index_list[i] = 1; + stats_list[i].txfm_block_count = 8; + + for (int j = 0; j < txfm_size; j++) { + stats_list[i].abs_coeff_sum[j] = 0; + } + } + + double result = + av1_estimate_gop_bitrate(q_index_list, frame_count, stats_list); + EXPECT_NEAR(result, 0, 0.1); +} + +TEST(TPLModelTest, TxfmStatsInitTest) { + TplTxfmStats tpl_txfm_stats; + av1_init_tpl_txfm_stats(&tpl_txfm_stats); + EXPECT_EQ(tpl_txfm_stats.coeff_num, 256); + EXPECT_EQ(tpl_txfm_stats.txfm_block_count, 0); + for (int i = 0; i < tpl_txfm_stats.coeff_num; ++i) { + EXPECT_DOUBLE_EQ(tpl_txfm_stats.abs_coeff_sum[i], 0); + } +} + +TEST(TPLModelTest, TxfmStatsAccumulateTest) { + TplTxfmStats sub_stats; + av1_init_tpl_txfm_stats(&sub_stats); + sub_stats.txfm_block_count = 17; + for (int i = 0; i < sub_stats.coeff_num; ++i) { + sub_stats.abs_coeff_sum[i] = i; + } + + TplTxfmStats accumulated_stats; + av1_init_tpl_txfm_stats(&accumulated_stats); + accumulated_stats.txfm_block_count = 13; + for (int i = 0; i 
< accumulated_stats.coeff_num; ++i) { + accumulated_stats.abs_coeff_sum[i] = 5 * i; + } + + av1_accumulate_tpl_txfm_stats(&sub_stats, &accumulated_stats); + EXPECT_DOUBLE_EQ(accumulated_stats.txfm_block_count, 30); + for (int i = 0; i < accumulated_stats.coeff_num; ++i) { + EXPECT_DOUBLE_EQ(accumulated_stats.abs_coeff_sum[i], 6 * i); + } +} + +TEST(TPLModelTest, TxfmStatsRecordTest) { + TplTxfmStats stats1; + TplTxfmStats stats2; + av1_init_tpl_txfm_stats(&stats1); + av1_init_tpl_txfm_stats(&stats2); + + tran_low_t coeff[256]; + for (int i = 0; i < 256; ++i) { + coeff[i] = i; + } + av1_record_tpl_txfm_block(&stats1, coeff); + EXPECT_EQ(stats1.txfm_block_count, 1); + + // we record the same transform block twice for testing purpose + av1_record_tpl_txfm_block(&stats2, coeff); + av1_record_tpl_txfm_block(&stats2, coeff); + EXPECT_EQ(stats2.txfm_block_count, 2); + + EXPECT_EQ(stats1.coeff_num, 256); + EXPECT_EQ(stats2.coeff_num, 256); + for (int i = 0; i < 256; ++i) { + EXPECT_DOUBLE_EQ(stats2.abs_coeff_sum[i], 2 * stats1.abs_coeff_sum[i]); + } +} + +} // namespace diff --git a/third_party/libaom/source/libaom/test/variance_test.cc b/third_party/libaom/source/libaom/test/variance_test.cc index fa90305acd..6bb96ce46f 100644 --- a/third_party/libaom/source/libaom/test/variance_test.cc +++ b/third_party/libaom/source/libaom/test/variance_test.cc @@ -1004,8 +1004,8 @@ void SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc>::RefTest() { for (int y0 = 0; y0 < 4; ++y0) { uint32_t sse1, sse2; uint32_t var1, var2; - jcp_param_.fwd_offset = quant_dist_lookup_table[x0][y0][0]; - jcp_param_.bck_offset = quant_dist_lookup_table[x0][y0][1]; + jcp_param_.fwd_offset = quant_dist_lookup_table[y0][x0]; + jcp_param_.bck_offset = quant_dist_lookup_table[y0][1 - x0]; ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 0, x, y, src_, width(), &sse1, sec_, &jcp_param_)); diff --git a/third_party/libaom/source/libaom/test/warp_filter_test_util.cc 
b/third_party/libaom/source/libaom/test/warp_filter_test_util.cc index 07a2e3f6e6..0e6e8b1324 100644 --- a/third_party/libaom/source/libaom/test/warp_filter_test_util.cc +++ b/third_party/libaom/source/libaom/test/warp_filter_test_util.cc @@ -226,8 +226,8 @@ void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) { conv_params.use_dist_wtd_comp_avg = 0; } else { conv_params.use_dist_wtd_comp_avg = 1; - conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; - conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + conv_params.fwd_offset = quant_dist_lookup_table[jj][ii]; + conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii]; } av1_warp_affine_c(mat, input, w, h, stride, output, 32, 32, out_w, out_h, out_w, sub_x, sub_y, &conv_params, alpha, @@ -240,8 +240,8 @@ void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) { conv_params.use_dist_wtd_comp_avg = 0; } else { conv_params.use_dist_wtd_comp_avg = 1; - conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; - conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + conv_params.fwd_offset = quant_dist_lookup_table[jj][ii]; + conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii]; } test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h, out_w, sub_x, sub_y, &conv_params, alpha, beta, gamma, @@ -424,8 +424,8 @@ void AV1HighbdWarpFilterTest::RunCheckOutput( conv_params.use_dist_wtd_comp_avg = 0; } else { conv_params.use_dist_wtd_comp_avg = 1; - conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; - conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + conv_params.fwd_offset = quant_dist_lookup_table[jj][ii]; + conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii]; } av1_highbd_warp_affine_c(mat, input, w, h, stride, output, 32, 32, @@ -441,8 +441,8 @@ void AV1HighbdWarpFilterTest::RunCheckOutput( conv_params.use_dist_wtd_comp_avg = 0; } else { conv_params.use_dist_wtd_comp_avg = 1; - conv_params.fwd_offset = 
quant_dist_lookup_table[ii][jj][0]; - conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + conv_params.fwd_offset = quant_dist_lookup_table[jj][ii]; + conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii]; } test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h, out_w, sub_x, sub_y, bd, &conv_params, alpha, beta, diff --git a/third_party/libaom/source/libaom/third_party/fastfeat/fast.c b/third_party/libaom/source/libaom/third_party/fastfeat/fast.c index f29ac8f725..30efde8396 100644 --- a/third_party/libaom/source/libaom/third_party/fastfeat/fast.c +++ b/third_party/libaom/source/libaom/third_party/fastfeat/fast.c @@ -1,3 +1,33 @@ +// Copyright (c) 2006, 2008 Edward Rosten +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// *Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// *Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// *Neither the name of the University of Cambridge nor the names of +// its contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // clang-format off #include <stdlib.h> #include "fast.h" diff --git a/third_party/libaom/source/libaom/third_party/fastfeat/fast.h b/third_party/libaom/source/libaom/third_party/fastfeat/fast.h index a65d5a5d17..d7a9617cce 100644 --- a/third_party/libaom/source/libaom/third_party/fastfeat/fast.h +++ b/third_party/libaom/source/libaom/third_party/fastfeat/fast.h @@ -1,3 +1,33 @@ +// Copyright (c) 2006, 2008 Edward Rosten +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// *Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// *Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// *Neither the name of the University of Cambridge nor the names of +// its contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // clang-format off #ifndef FAST_H #define FAST_H diff --git a/third_party/libaom/source/libaom/third_party/fastfeat/fast_9.c b/third_party/libaom/source/libaom/third_party/fastfeat/fast_9.c index 61c654c472..c0fdbe26cd 100644 --- a/third_party/libaom/source/libaom/third_party/fastfeat/fast_9.c +++ b/third_party/libaom/source/libaom/third_party/fastfeat/fast_9.c @@ -1,3 +1,33 @@ +// Copyright (c) 2006, 2008 Edward Rosten +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// *Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// *Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// *Neither the name of the University of Cambridge nor the names of +// its contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // clang-format off /*This is mechanically generated code*/ #include <stdlib.h> diff --git a/third_party/libaom/source/libaom/third_party/fastfeat/nonmax.c b/third_party/libaom/source/libaom/third_party/fastfeat/nonmax.c index 0dbc660cb0..2e048e5460 100644 --- a/third_party/libaom/source/libaom/third_party/fastfeat/nonmax.c +++ b/third_party/libaom/source/libaom/third_party/fastfeat/nonmax.c @@ -1,3 +1,33 @@ +// Copyright (c) 2006, 2008 Edward Rosten +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// *Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// *Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// *Neither the name of the University of Cambridge nor the names of +// its contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // clang-format off #include <stdlib.h> #include "fast.h" diff --git a/third_party/libaom/source/libaom/third_party/vector/vector.c b/third_party/libaom/source/libaom/third_party/vector/vector.c index 4b8b9c6fd9..2295b8f080 100644 --- a/third_party/libaom/source/libaom/third_party/vector/vector.c +++ b/third_party/libaom/source/libaom/third_party/vector/vector.c @@ -3,7 +3,7 @@ The MIT License(MIT) Copyright(c) 2016 Peter Goldsborough Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files(the "Software"), to deal in +this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, diff --git a/third_party/libaom/source/libaom/third_party/vector/vector.h b/third_party/libaom/source/libaom/third_party/vector/vector.h index d09eb64c93..acc70fe099 100644 --- a/third_party/libaom/source/libaom/third_party/vector/vector.h +++ 
b/third_party/libaom/source/libaom/third_party/vector/vector.h @@ -3,7 +3,7 @@ The MIT License(MIT) Copyright(c) 2016 Peter Goldsborough Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files(the "Software"), to deal in +this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, diff --git a/third_party/libaom/source/libaom/tools/auto_refactor/c_files/decl_status_code.c b/third_party/libaom/source/libaom/tools/auto_refactor/c_files/decl_status_code.c index 4c7afbaae5..bd445ab1b5 100644 --- a/third_party/libaom/source/libaom/tools/auto_refactor/c_files/decl_status_code.c +++ b/third_party/libaom/source/libaom/tools/auto_refactor/c_files/decl_status_code.c @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + typedef struct S1 { int x; } T1; diff --git a/third_party/libaom/source/libaom/tools/auto_refactor/c_files/func_in_out.c b/third_party/libaom/source/libaom/tools/auto_refactor/c_files/func_in_out.c index 8c14edc109..67ab58d520 100644 --- a/third_party/libaom/source/libaom/tools/auto_refactor/c_files/func_in_out.c +++ b/third_party/libaom/source/libaom/tools/auto_refactor/c_files/func_in_out.c @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + typedef struct XD { int u; int v; diff --git a/third_party/libaom/source/libaom/tools/auto_refactor/c_files/global_variable.c b/third_party/libaom/source/libaom/tools/auto_refactor/c_files/global_variable.c index 1934e20a75..26d5385e97 100644 --- a/third_party/libaom/source/libaom/tools/auto_refactor/c_files/global_variable.c +++ b/third_party/libaom/source/libaom/tools/auto_refactor/c_files/global_variable.c @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + extern const int global_a[13]; const int global_b = 0; diff --git a/third_party/libaom/source/libaom/tools/auto_refactor/c_files/parse_lvalue.c b/third_party/libaom/source/libaom/tools/auto_refactor/c_files/parse_lvalue.c index 093ab55ac6..97113efc15 100644 --- a/third_party/libaom/source/libaom/tools/auto_refactor/c_files/parse_lvalue.c +++ b/third_party/libaom/source/libaom/tools/auto_refactor/c_files/parse_lvalue.c @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + typedef struct RD { int u; int v; diff --git a/third_party/libaom/source/libaom/tools/auto_refactor/c_files/simple_code.c b/third_party/libaom/source/libaom/tools/auto_refactor/c_files/simple_code.c index 330fc3a90c..dd89a15621 100644 --- a/third_party/libaom/source/libaom/tools/auto_refactor/c_files/simple_code.c +++ b/third_party/libaom/source/libaom/tools/auto_refactor/c_files/simple_code.c @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + typedef struct S { int x; int y; diff --git a/third_party/libaom/source/libaom/tools/auto_refactor/c_files/struct_code.c b/third_party/libaom/source/libaom/tools/auto_refactor/c_files/struct_code.c index 62b9d7adee..e14372c83e 100644 --- a/third_party/libaom/source/libaom/tools/auto_refactor/c_files/struct_code.c +++ b/third_party/libaom/source/libaom/tools/auto_refactor/c_files/struct_code.c @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + typedef struct S1 { int x; } T1; diff --git a/third_party/libyuv/CMakeLists.txt b/third_party/libyuv/CMakeLists.txt index 60d17338ed..94bdfe0af6 100644 --- a/third_party/libyuv/CMakeLists.txt +++ b/third_party/libyuv/CMakeLists.txt @@ -24,3 +24,7 @@ if (NOT WINDOWS_MSVC_X86_64) target_link_libraries(libyuv_unittest ${ly_lib_name} gtest_main Threads::Threads) endif() + +if (LINUX_AARCH64) + target_compile_definitions(${ly_lib_name} PRIVATE LIBYUV_DISABLE_NEON=1) +endif() diff --git a/third_party/libyuv/include/libyuv/compare_row.h b/third_party/libyuv/include/libyuv/compare_row.h index e95b9d93eb..64115b3a3f 100644 --- a/third_party/libyuv/include/libyuv/compare_row.h +++ b/third_party/libyuv/include/libyuv/compare_row.h @@ -55,20 +55,20 @@ extern "C" { // The following are available for Visual C and clangcl 32 bit: #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \ + !defined(__clang__) && \ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) #define HAS_HASHDJB2_AVX2 #define 
HAS_SUMSQUAREERROR_AVX2 #endif -// The following are available for GCC and clangcl 64 bit: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +// The following are available for GCC and clangcl: +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) #define HAS_HAMMINGDISTANCE_SSSE3 #endif -// The following are available for GCC and clangcl 64 bit: +// The following are available for GCC and clangcl: #if !defined(LIBYUV_DISABLE_X86) && defined(CLANG_HAS_AVX2) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) + (defined(__x86_64__) || defined(__i386__)) #define HAS_HAMMINGDISTANCE_AVX2 #endif diff --git a/third_party/libyuv/include/libyuv/convert.h b/third_party/libyuv/include/libyuv/convert.h index 40869ef218..93e7550be8 100644 --- a/third_party/libyuv/include/libyuv/convert.h +++ b/third_party/libyuv/include/libyuv/convert.h @@ -693,6 +693,19 @@ int RAWToI420(const uint8_t* src_raw, int width, int height); +// RGB big endian (rgb in memory) to J420. +LIBYUV_API +int RAWToJ420(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + // RGB16 (RGBP fourcc) little endian to I420. 
LIBYUV_API int RGB565ToI420(const uint8_t* src_rgb565, diff --git a/third_party/libyuv/include/libyuv/convert_argb.h b/third_party/libyuv/include/libyuv/convert_argb.h index 297de15162..eb4ebd54a8 100644 --- a/third_party/libyuv/include/libyuv/convert_argb.h +++ b/third_party/libyuv/include/libyuv/convert_argb.h @@ -54,12 +54,30 @@ LIBYUV_API extern const struct YuvConstants kYvuV2020Constants; // BT.2020 full NV21ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i) #define NV21ToRAWMatrix(a, b, c, d, e, f, g, h, i) \ NV12ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i) +#define I010ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \ + I010ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k) +#define I210ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \ + I210ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k) +#define I410ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \ + I410ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k) +#define I010ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ + I010ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) +#define I210ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ + I210ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) +#define I410ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ + I410ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) #define I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ I420AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) #define I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ I422AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) #define I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ I444AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) +#define I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ + I010AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) +#define I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ + I210AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) +#define 
I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ + I410AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) // Alias. #define ARGBToARGB ARGBCopy @@ -125,32 +143,6 @@ int J420ToABGR(const uint8_t* src_y, int width, int height); -// Convert F420 to ARGB. BT.709 full range -LIBYUV_API -int F420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert F420 to ABGR. BT.709 full range -LIBYUV_API -int F420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - // Convert H420 to ARGB. LIBYUV_API int H420ToARGB(const uint8_t* src_y, @@ -814,29 +806,29 @@ int I010ToAR30(const uint16_t* src_y, int width, int height); -// Convert I010 to AB30. +// Convert H010 to AR30. LIBYUV_API -int I010ToAB30(const uint16_t* src_y, +int H010ToAR30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, - uint8_t* dst_ab30, - int dst_stride_ab30, + uint8_t* dst_ar30, + int dst_stride_ar30, int width, int height); -// Convert H010 to AR30. +// Convert I010 to AB30. LIBYUV_API -int H010ToAR30(const uint16_t* src_y, +int I010ToAB30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, - uint8_t* dst_ar30, - int dst_stride_ar30, + uint8_t* dst_ab30, + int dst_stride_ab30, int width, int height); @@ -1073,6 +1065,42 @@ int AR30ToAB30(const uint8_t* src_ar30, int width, int height); +// Convert AR64 to ARGB. +LIBYUV_API +int AR64ToARGB(const uint16_t* src_ar64, + int src_stride_ar64, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert AB64 to ABGR. 
+#define AB64ToABGR AR64ToARGB + +// Convert AB64 to ARGB. +LIBYUV_API +int AB64ToARGB(const uint16_t* src_ab64, + int src_stride_ab64, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert AR64 to ABGR. +#define AR64ToABGR AB64ToARGB + +// Convert AR64 To AB64. +LIBYUV_API +int AR64ToAB64(const uint16_t* src_ar64, + int src_stride_ar64, + uint16_t* dst_ab64, + int dst_stride_ab64, + int width, + int height); + +// Convert AB64 To AR64. +#define AB64ToAR64 AR64ToAB64 + // src_width/height provided by capture // dst_width/height for clipping determine final size. LIBYUV_API @@ -1385,6 +1413,19 @@ int I420ToAR30(const uint8_t* src_y, int width, int height); +// Convert I420 to AB30. +LIBYUV_API +int I420ToAB30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height); + // Convert H420 to AR30. LIBYUV_API int H420ToAR30(const uint8_t* src_y, @@ -1398,6 +1439,19 @@ int H420ToAR30(const uint8_t* src_y, int width, int height); +// Convert H420 to AB30. +LIBYUV_API +int H420ToAB30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height); + // Convert I420 to ARGB with matrix. LIBYUV_API int I420ToARGBMatrix(const uint8_t* src_y, @@ -1440,7 +1494,7 @@ int I444ToARGBMatrix(const uint8_t* src_y, int width, int height); -// multiply 10 bit yuv into high bits to allow any number of bits. +// Convert 10 bit 420 YUV to ARGB with matrix. LIBYUV_API int I010ToAR30Matrix(const uint16_t* src_y, int src_stride_y, @@ -1454,7 +1508,7 @@ int I010ToAR30Matrix(const uint16_t* src_y, int width, int height); -// multiply 10 bit yuv into high bits to allow any number of bits. +// Convert 10 bit 420 YUV to ARGB with matrix. 
LIBYUV_API int I210ToAR30Matrix(const uint16_t* src_y, int src_stride_y, @@ -1468,6 +1522,20 @@ int I210ToAR30Matrix(const uint16_t* src_y, int width, int height); +// Convert 10 bit 444 YUV to ARGB with matrix. +LIBYUV_API +int I410ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height); + // Convert 10 bit YUV to ARGB with matrix. LIBYUV_API int I010ToARGBMatrix(const uint16_t* src_y, @@ -1482,6 +1550,34 @@ int I010ToARGBMatrix(const uint16_t* src_y, int width, int height); +// multiply 12 bit yuv into high bits to allow any number of bits. +LIBYUV_API +int I012ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert 12 bit YUV to ARGB with matrix. +LIBYUV_API +int I012ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height); + // Convert 10 bit 422 YUV to ARGB with matrix. LIBYUV_API int I210ToARGBMatrix(const uint16_t* src_y, @@ -1496,6 +1592,87 @@ int I210ToARGBMatrix(const uint16_t* src_y, int width, int height); +// Convert 10 bit 444 YUV to ARGB with matrix. +LIBYUV_API +int I410ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert P010 to ARGB with matrix. 
+LIBYUV_API +int P010ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert P210 to ARGB with matrix. +LIBYUV_API +int P210ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert P010 to AR30 with matrix. +LIBYUV_API +int P010ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert P210 to AR30 with matrix. +LIBYUV_API +int P210ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// P012 and P010 use most significant bits so the conversion is the same. +// Convert P012 to ARGB with matrix. +#define P012ToARGBMatrix P010ToARGBMatrix +// Convert P012 to AR30 with matrix. +#define P012ToAR30Matrix P010ToAR30Matrix +// Convert P212 to ARGB with matrix. +#define P212ToARGBMatrix P210ToARGBMatrix +// Convert P212 to AR30 with matrix. +#define P212ToAR30Matrix P210ToAR30Matrix + +// Convert P016 to ARGB with matrix. +#define P016ToARGBMatrix P010ToARGBMatrix +// Convert P016 to AR30 with matrix. +#define P016ToAR30Matrix P010ToAR30Matrix +// Convert P216 to ARGB with matrix. +#define P216ToARGBMatrix P210ToARGBMatrix +// Convert P216 to AR30 with matrix. +#define P216ToAR30Matrix P210ToAR30Matrix + // Convert I420 with Alpha to preattenuated ARGB with matrix. 
LIBYUV_API int I420AlphaToARGBMatrix(const uint8_t* src_y, @@ -1547,6 +1724,57 @@ int I444AlphaToARGBMatrix(const uint8_t* src_y, int height, int attenuate); +// Convert I010 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I010AlphaToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate); + +// Convert I210 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I210AlphaToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate); + +// Convert I410 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I410AlphaToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate); + // Convert NV12 to ARGB with matrix. LIBYUV_API int NV12ToARGBMatrix(const uint8_t* src_y, diff --git a/third_party/libyuv/include/libyuv/convert_from_argb.h b/third_party/libyuv/include/libyuv/convert_from_argb.h index d992363ceb..bf48786041 100644 --- a/third_party/libyuv/include/libyuv/convert_from_argb.h +++ b/third_party/libyuv/include/libyuv/convert_from_argb.h @@ -153,6 +153,30 @@ int ARGBToI444(const uint8_t* src_argb, int width, int height); +// Convert ARGB to AR64. 
+LIBYUV_API +int ARGBToAR64(const uint8_t* src_argb, + int src_stride_argb, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height); + +// Convert ABGR to AB64. +#define ABGRToAB64 ARGBToAR64 + +// Convert ARGB to AB64. +LIBYUV_API +int ARGBToAB64(const uint8_t* src_argb, + int src_stride_argb, + uint16_t* dst_ab64, + int dst_stride_ab64, + int width, + int height); + +// Convert ABGR to AR64. +#define ABGRToAR64 ARGBToAB64 + // Convert ARGB To I422. LIBYUV_API int ARGBToI422(const uint8_t* src_argb, diff --git a/third_party/libyuv/include/libyuv/planar_functions.h b/third_party/libyuv/include/libyuv/planar_functions.h index ebefb5682f..def773cb44 100644 --- a/third_party/libyuv/include/libyuv/planar_functions.h +++ b/third_party/libyuv/include/libyuv/planar_functions.h @@ -229,6 +229,60 @@ void MergeARGBPlane(const uint8_t* src_r, int width, int height); +// Merge separate 'depth' bit R, G and B planes stored in lsb +// into one interleaved XR30 plane. +// depth should in range [10, 16] +LIBYUV_API +void MergeXR30Plane(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height, + int depth); + +// Merge separate 'depth' bit R, G, B and A planes stored in lsb +// into one interleaved AR64 plane. +// src_a can be NULL to fill opaque value to alpha. +// depth should in range [1, 16] +LIBYUV_API +void MergeAR64Plane(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + const uint16_t* src_a, + int src_stride_a, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height, + int depth); + +// Merge separate 'depth' bit R, G, B and A planes stored in lsb +// into one interleaved ARGB plane. +// src_a can be NULL to fill opaque value to alpha. 
+// depth should in range [8, 16] +LIBYUV_API +void MergeARGB16To8Plane(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int depth); + // Copy I400. Supports inverting. LIBYUV_API int I400ToI400(const uint8_t* src_y, @@ -945,7 +999,7 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb, int width); // Shuffle ARGB channel order. e.g. BGRA to ARGB. -// shuffler is 16 bytes and must be aligned. +// shuffler is 16 bytes. LIBYUV_API int ARGBShuffle(const uint8_t* src_bgra, int src_stride_bgra, @@ -955,6 +1009,17 @@ int ARGBShuffle(const uint8_t* src_bgra, int width, int height); +// Shuffle AR64 channel order. e.g. AR64 to AB64. +// shuffler is 16 bytes. +LIBYUV_API +int AR64Shuffle(const uint16_t* src_ar64, + int src_stride_ar64, + uint16_t* dst_ar64, + int dst_stride_ar64, + const uint8_t* shuffler, + int width, + int height); + // Sobel ARGB effect with planar output. 
LIBYUV_API int ARGBSobelToPlane(const uint8_t* src_argb, diff --git a/third_party/libyuv/include/libyuv/rotate_row.h b/third_party/libyuv/include/libyuv/rotate_row.h index 022293eef2..f4c701fb4f 100644 --- a/third_party/libyuv/include/libyuv/rotate_row.h +++ b/third_party/libyuv/include/libyuv/rotate_row.h @@ -32,8 +32,9 @@ extern "C" { #define LIBYUV_DISABLE_X86 #endif #endif -// The following are available for Visual C and clangcl 32 bit: -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) +// The following are available for Visual C 32 bit: +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \ + !defined(__clang__) #define HAS_TRANSPOSEWX8_SSSE3 #define HAS_TRANSPOSEUVWX8_SSE2 #endif diff --git a/third_party/libyuv/include/libyuv/row.h b/third_party/libyuv/include/libyuv/row.h index 68fb88b3e7..1444a04786 100644 --- a/third_party/libyuv/include/libyuv/row.h +++ b/third_party/libyuv/include/libyuv/row.h @@ -175,8 +175,8 @@ extern "C" { defined(_MSC_VER) // TODO(fbarchard): fix build error on android_full_debug=1 // https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I444ALPHATOARGBROW_SSSE3 #define HAS_I422ALPHATOARGBROW_SSSE3 +#define HAS_I444ALPHATOARGBROW_SSSE3 #endif #endif @@ -240,15 +240,15 @@ extern "C" { defined(_MSC_VER) // TODO(fbarchard): fix build error on android_full_debug=1 // https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I444ALPHATOARGBROW_AVX2 #define HAS_I422ALPHATOARGBROW_AVX2 +#define HAS_I444ALPHATOARGBROW_AVX2 #endif #endif -// The following are available for AVX2 Visual C and clangcl 32 bit: +// The following are available for AVX2 Visual C 32 bit: // TODO(fbarchard): Port to gcc. 
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \ - (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) + !defined(__clang__) && defined(VISUALC_HAS_AVX2) #define HAS_ARGB1555TOARGBROW_AVX2 #define HAS_ARGB4444TOARGBROW_AVX2 #define HAS_ARGBTOARGB1555ROW_AVX2 @@ -269,33 +269,54 @@ extern "C" { // The following are available for gcc/clang x86 platforms: // TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) #define HAS_ABGRTOAR30ROW_SSSE3 #define HAS_ARGBTOAR30ROW_SSSE3 +#define HAS_ARGBTOAR64ROW_SSSE3 +#define HAS_ARGBTOAB64ROW_SSSE3 +#define HAS_AR64TOARGBROW_SSSE3 +#define HAS_AB64TOARGBROW_SSSE3 #define HAS_CONVERT16TO8ROW_SSSE3 #define HAS_CONVERT8TO16ROW_SSE2 #define HAS_HALFMERGEUVROW_SSSE3 #define HAS_I210TOAR30ROW_SSSE3 #define HAS_I210TOARGBROW_SSSE3 +#define HAS_I212TOAR30ROW_SSSE3 +#define HAS_I212TOARGBROW_SSSE3 #define HAS_I400TOARGBROW_SSE2 #define HAS_I422TOAR30ROW_SSSE3 +#define HAS_I410TOAR30ROW_SSSE3 +#define HAS_I410TOARGBROW_SSSE3 #define HAS_MERGEARGBROW_SSE2 +#define HAS_MERGEXRGBROW_SSE2 #define HAS_MERGERGBROW_SSSE3 #define HAS_MIRRORUVROW_SSSE3 +#define HAS_P210TOAR30ROW_SSSE3 +#define HAS_P210TOARGBROW_SSSE3 +#define HAS_P410TOAR30ROW_SSSE3 +#define HAS_P410TOARGBROW_SSSE3 #define HAS_RAWTORGBAROW_SSSE3 #define HAS_RGB24MIRRORROW_SSSE3 #define HAS_RGBATOYJROW_SSSE3 #define HAS_SPLITARGBROW_SSE2 #define HAS_SPLITARGBROW_SSSE3 +#define HAS_SPLITXRGBROW_SSE2 +#define HAS_SPLITXRGBROW_SSSE3 #define HAS_SPLITRGBROW_SSSE3 #define HAS_SWAPUVROW_SSSE3 + +#if defined(__x86_64__) || !defined(__pic__) +// TODO(fbarchard): fix build error on android_full_debug=1 +// https://code.google.com/p/libyuv/issues/detail?id=517 +#define HAS_I210ALPHATOARGBROW_SSSE3 +#define HAS_I410ALPHATOARGBROW_SSSE3 +#endif #endif // The following are available for AVX2 
gcc/clang x86 platforms: // TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \ +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || defined(__i386__)) && \ (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) #define HAS_ABGRTOAR30ROW_AVX2 #define HAS_ABGRTOUVROW_AVX2 @@ -303,14 +324,32 @@ extern "C" { #define HAS_ARGBTOAR30ROW_AVX2 #define HAS_ARGBTORAWROW_AVX2 #define HAS_ARGBTORGB24ROW_AVX2 +#define HAS_ARGBTOAR64ROW_AVX2 +#define HAS_ARGBTOAB64ROW_AVX2 +#define HAS_AR64TOARGBROW_AVX2 +#define HAS_AB64TOARGBROW_AVX2 #define HAS_CONVERT16TO8ROW_AVX2 #define HAS_CONVERT8TO16ROW_AVX2 #define HAS_DIVIDEROW_16_AVX2 #define HAS_HALFMERGEUVROW_AVX2 +#define HAS_MERGEAR64ROW_AVX2 +#define HAS_MERGEARGB16TO8ROW_AVX2 #define HAS_MERGEARGBROW_AVX2 +#define HAS_MERGEXR30ROW_AVX2 +#define HAS_MERGEXR64ROW_AVX2 +#define HAS_MERGEXRGB16TO8ROW_AVX2 +#define HAS_MERGEXRGBROW_AVX2 #define HAS_I210TOAR30ROW_AVX2 #define HAS_I210TOARGBROW_AVX2 +#define HAS_I212TOAR30ROW_AVX2 +#define HAS_I212TOARGBROW_AVX2 #define HAS_I400TOARGBROW_AVX2 +#define HAS_I410TOAR30ROW_AVX2 +#define HAS_I410TOARGBROW_AVX2 +#define HAS_P210TOAR30ROW_AVX2 +#define HAS_P210TOARGBROW_AVX2 +#define HAS_P410TOAR30ROW_AVX2 +#define HAS_P410TOARGBROW_AVX2 #define HAS_I422TOAR30ROW_AVX2 #define HAS_I422TOUYVYROW_AVX2 #define HAS_I422TOYUY2ROW_AVX2 @@ -319,18 +358,25 @@ extern "C" { #define HAS_MULTIPLYROW_16_AVX2 #define HAS_RGBATOYJROW_AVX2 #define HAS_SPLITARGBROW_AVX2 +#define HAS_SPLITXRGBROW_AVX2 #define HAS_SPLITUVROW_16_AVX2 #define HAS_SWAPUVROW_AVX2 // TODO(fbarchard): Fix AVX2 version of YUV24 // #define HAS_NV21TOYUV24ROW_AVX2 + +#if defined(__x86_64__) || !defined(__pic__) +// TODO(fbarchard): fix build error on android_full_debug=1 +// https://code.google.com/p/libyuv/issues/detail?id=517 +#define HAS_I210ALPHATOARGBROW_AVX2 +#define HAS_I410ALPHATOARGBROW_AVX2 +#endif #endif // The following are 
available for AVX512 clang x86 platforms: // TODO(fbarchard): Port to GCC and Visual C // TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789 -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \ - (defined(CLANG_HAS_AVX512)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || defined(__i386__)) && (defined(CLANG_HAS_AVX512)) #define HAS_ARGBTORGB24ROW_AVX512VBMI #endif @@ -353,6 +399,10 @@ extern "C" { #define HAS_ARGBTORGB24ROW_NEON #define HAS_ARGBTORGB565DITHERROW_NEON #define HAS_ARGBTORGB565ROW_NEON +#define HAS_ARGBTOAR64ROW_NEON +#define HAS_ARGBTOAB64ROW_NEON +#define HAS_AR64TOARGBROW_NEON +#define HAS_AB64TOARGBROW_NEON #define HAS_ARGBTOUV444ROW_NEON #define HAS_ARGBTOUVJROW_NEON #define HAS_ARGBTOUVROW_NEON @@ -381,7 +431,13 @@ extern "C" { #define HAS_I422TOYUY2ROW_NEON #define HAS_I444TOARGBROW_NEON #define HAS_J400TOARGBROW_NEON +#define HAS_MERGEAR64ROW_NEON +#define HAS_MERGEARGB16TO8ROW_NEON #define HAS_MERGEARGBROW_NEON +#define HAS_MERGEXR30ROW_NEON +#define HAS_MERGEXR64ROW_NEON +#define HAS_MERGEXRGB16TO8ROW_NEON +#define HAS_MERGEXRGBROW_NEON #define HAS_MERGEUVROW_NEON #define HAS_MERGEUVROW_16_NEON #define HAS_MIRRORROW_NEON @@ -412,6 +468,7 @@ extern "C" { #define HAS_RGBATOYROW_NEON #define HAS_SETROW_NEON #define HAS_SPLITARGBROW_NEON +#define HAS_SPLITXRGBROW_NEON #define HAS_SPLITRGBROW_NEON #define HAS_SPLITUVROW_NEON #define HAS_SPLITUVROW_16_NEON @@ -490,24 +547,14 @@ extern "C" { #define HAS_BGRATOYROW_MSA #define HAS_HALFFLOATROW_MSA #define HAS_I400TOARGBROW_MSA -#define HAS_I422ALPHATOARGBROW_MSA -#define HAS_I422TOARGBROW_MSA -#define HAS_I422TORGB24ROW_MSA -#define HAS_I422TORGBAROW_MSA #define HAS_I422TOUYVYROW_MSA #define HAS_I422TOYUY2ROW_MSA -#define HAS_I444TOARGBROW_MSA -#define HAS_I422TOARGB1555ROW_MSA -#define HAS_I422TORGB565ROW_MSA #define HAS_INTERPOLATEROW_MSA #define HAS_J400TOARGBROW_MSA #define HAS_MERGEUVROW_MSA 
#define HAS_MIRRORROW_MSA #define HAS_MIRRORUVROW_MSA #define HAS_MIRRORSPLITUVROW_MSA -#define HAS_NV12TOARGBROW_MSA -#define HAS_NV12TORGB565ROW_MSA -#define HAS_NV21TOARGBROW_MSA #define HAS_RAWTOARGBROW_MSA #define HAS_RAWTORGB24ROW_MSA #define HAS_RAWTOUVROW_MSA @@ -527,10 +574,8 @@ extern "C" { #define HAS_SOBELXYROW_MSA #define HAS_SOBELYROW_MSA #define HAS_SPLITUVROW_MSA -#define HAS_UYVYTOARGBROW_MSA #define HAS_UYVYTOUVROW_MSA #define HAS_UYVYTOYROW_MSA -#define HAS_YUY2TOARGBROW_MSA #define HAS_YUY2TOUV422ROW_MSA #define HAS_YUY2TOUVROW_MSA #define HAS_YUY2TOYROW_MSA @@ -580,8 +625,6 @@ extern "C" { #define HAS_I400TOARGBROW_MMI #define HAS_I422TOUYVYROW_MMI #define HAS_I422TOYUY2ROW_MMI -#define HAS_I422TOARGBROW_MMI -#define HAS_I444TOARGBROW_MMI #define HAS_INTERPOLATEROW_MMI #define HAS_J400TOARGBROW_MMI #define HAS_MERGERGBROW_MMI @@ -612,20 +655,6 @@ extern "C" { #define HAS_YUY2TOUV422ROW_MMI #define HAS_YUY2TOUVROW_MMI #define HAS_YUY2TOYROW_MMI -#define HAS_I210TOARGBROW_MMI -#define HAS_I422TOARGB4444ROW_MMI -#define HAS_I422TOARGB1555ROW_MMI -#define HAS_I422TORGB565ROW_MMI -#define HAS_NV21TORGB24ROW_MMI -#define HAS_NV12TORGB24ROW_MMI -#define HAS_I422ALPHATOARGBROW_MMI -#define HAS_I422TORGB24ROW_MMI -#define HAS_NV12TOARGBROW_MMI -#define HAS_NV21TOARGBROW_MMI -#define HAS_NV12TORGB565ROW_MMI -#define HAS_YUY2TOARGBROW_MMI -#define HAS_UYVYTOARGBROW_MMI -#define HAS_I422TORGBAROW_MMI #endif #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) @@ -634,6 +663,7 @@ extern "C" { #else #define SIMD_ALIGNED(var) __declspec(align(16)) var #endif +#define LIBYUV_NOINLINE __declspec(noinline) typedef __declspec(align(16)) int16_t vec16[8]; typedef __declspec(align(16)) int32_t vec32[4]; typedef __declspec(align(16)) float vecf32[4]; @@ -654,6 +684,7 @@ typedef __declspec(align(32)) uint8_t ulvec8[32]; #else #define SIMD_ALIGNED(var) var __attribute__((aligned(16))) #endif +#define LIBYUV_NOINLINE __attribute__((noinline)) typedef 
int16_t __attribute__((vector_size(16))) vec16; typedef int32_t __attribute__((vector_size(16))) vec32; typedef float __attribute__((vector_size(16))) vecf32; @@ -669,6 +700,7 @@ typedef uint32_t __attribute__((vector_size(32))) ulvec32; typedef uint8_t __attribute__((vector_size(32))) ulvec8; #else #define SIMD_ALIGNED(var) var +#define LIBYUV_NOINLINE typedef int16_t vec16[8]; typedef int32_t vec32[4]; typedef float vecf32[4]; @@ -684,33 +716,18 @@ typedef uint32_t ulvec32[8]; typedef uint8_t ulvec8[32]; #endif -#if defined(__aarch64__) -// This struct is for Arm64 color conversion. -struct YuvConstants { - uvec16 kUVToRB; - uvec16 kUVToRB2; - uvec16 kUVToG; - uvec16 kUVToG2; - vec16 kUVBiasBGR; - vec32 kYToRgb; -}; -#elif defined(__arm__) -// This struct is for ArmV7 color conversion. +#if defined(__aarch64__) || defined(__arm__) +// This struct is for ARM color conversion. struct YuvConstants { - uvec8 kUVToRB; - uvec8 kUVToG; - vec16 kUVBiasBGR; - vec32 kYToRgb; + uvec8 kUVCoeff; + vec16 kRGBCoeffBias; }; #else // This struct is for Intel color conversion. 
struct YuvConstants { - int8_t kUVToB[32]; - int8_t kUVToG[32]; - int8_t kUVToR[32]; - int16_t kUVBiasB[16]; - int16_t kUVBiasG[16]; - int16_t kUVBiasR[16]; + uint8_t kUVToB[32]; + uint8_t kUVToG[32]; + uint8_t kUVToR[32]; int16_t kYToRgb[16]; int16_t kYBiasToRgb[16]; }; @@ -719,11 +736,8 @@ struct YuvConstants { #define KUVTOB 0 #define KUVTOG 32 #define KUVTOR 64 -#define KUVBIASB 96 -#define KUVBIASG 128 -#define KUVBIASR 160 -#define KYTORGB 192 -#define KYBIASTORGB 224 +#define KYTORGB 96 +#define KYBIASTORGB 128 #endif @@ -995,11 +1009,11 @@ void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width); void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width); -void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); -void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); -void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_y, int width); -void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width); +void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); +void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width); void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); @@ -1194,16 +1208,16 @@ void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB4444ToYRow_MMI(const uint8_t* 
src_argb4444, uint8_t* dst_y, int width); -void ARGBToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); -void ARGBToYJRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGBAToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void BGRAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); -void ABGRToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGBAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGB24ToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGB24ToYJRow_C(const uint8_t* src_argb, uint8_t* dst_yj, int width); -void RAWToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RAWToYJRow_C(const uint8_t* src_argb, uint8_t* dst_yj, int width); +void ARGBToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void ARGBToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RGBAToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void BGRAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void ABGRToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RGBAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RGB24ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RGB24ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RAWToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RAWToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width); @@ -1305,42 +1319,42 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_v, int width); void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ABGRToUVRow_Any_AVX2(const uint8_t* src_ptr, 
- int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVJRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void BGRAToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ABGRToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void RGBAToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -1349,7 +1363,7 @@ void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_v, int width); void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -1372,47 +1386,47 @@ void ARGBToUVRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_v, int width); void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void RAWToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, 
uint8_t* dst_u, uint8_t* dst_v, int width); void RGB565ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGB1555ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGB4444ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -1621,7 +1635,7 @@ void MirrorSplitUVRow_C(const uint8_t* src_uv, void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width); @@ -1637,9 +1651,13 @@ void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr, void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBMirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); -void RGB24MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); -void RGB24MirrorRow_C(const uint8_t* src, uint8_t* dst, int width); +void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_rgb24, + int width); +void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_rgb24, + int width); +void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width); void RGB24MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1860,23 +1878,23 @@ void MergeARGBRow_NEON(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width); -void MergeARGBRow_Any_SSE2(const uint8_t* src_r, - const 
uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, +void MergeARGBRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, int width); -void MergeARGBRow_Any_AVX2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, +void MergeARGBRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, int width); -void MergeARGBRow_Any_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, +void MergeARGBRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, int width); void SplitARGBRow_C(const uint8_t* src_argb, uint8_t* dst_r, @@ -1902,31 +1920,31 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width); -void SplitARGBRow_NEON(const uint8_t* src_argb, +void SplitARGBRow_NEON(const uint8_t* src_rgba, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width); -void SplitARGBRow_Any_SSE2(const uint8_t* src_argb, +void SplitARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width); -void SplitARGBRow_Any_SSSE3(const uint8_t* src_argb, +void SplitARGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width); -void SplitARGBRow_Any_AVX2(const uint8_t* src_argb, +void SplitARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width); -void SplitARGBRow_Any_NEON(const uint8_t* src_argb, +void SplitARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, @@ -1952,20 +1970,20 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int 
width); -void MergeXRGBRow_Any_SSE2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, +void MergeXRGBRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, int width); -void MergeXRGBRow_Any_AVX2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, +void MergeXRGBRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, int width); -void MergeXRGBRow_Any_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, +void MergeXRGBRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, int width); void SplitXRGBRow_C(const uint8_t* src_argb, uint8_t* dst_r, @@ -1987,32 +2005,205 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width); -void SplitXRGBRow_NEON(const uint8_t* src_argb, +void SplitXRGBRow_NEON(const uint8_t* src_rgba, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); -void SplitXRGBRow_Any_SSE2(const uint8_t* src_argb, +void SplitXRGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); -void SplitXRGBRow_Any_SSSE3(const uint8_t* src_argb, +void SplitXRGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); -void SplitXRGBRow_Any_AVX2(const uint8_t* src_argb, +void SplitXRGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); -void SplitXRGBRow_Any_NEON(const uint8_t* src_argb, +void SplitXRGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); +void MergeXR30Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width); +void MergeAR64Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const 
uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width); +void MergeARGB16To8Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width); +void MergeXR64Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width); +void MergeXRGB16To8Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width); +void MergeXR30Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width); +void MergeAR64Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width); +void MergeARGB16To8Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width); +void MergeXR64Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width); +void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width); +void MergeXR30Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width); +void MergeXR30Row_10_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int /* depth */, + int width); +void MergeAR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width); +void MergeARGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width); +void MergeXR64Row_NEON(const uint16_t* 
src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width); +void MergeXRGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width); +void MergeXR30Row_Any_AVX2(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + uint8_t* dst_ptr, + int depth, + int width); +void MergeAR64Row_Any_AVX2(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + const uint16_t* a_buf, + uint16_t* dst_ptr, + int depth, + int width); +void MergeXR64Row_Any_AVX2(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + uint16_t* dst_ptr, + int depth, + int width); +void MergeARGB16To8Row_Any_AVX2(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + const uint16_t* a_buf, + uint8_t* dst_ptr, + int depth, + int width); +void MergeXRGB16To8Row_Any_AVX2(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + uint8_t* dst_ptr, + int depth, + int width); +void MergeXR30Row_Any_NEON(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + uint8_t* dst_ptr, + int depth, + int width); +void MergeXR30Row_10_Any_NEON(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + uint8_t* dst_ptr, + int depth, + int width); +void MergeAR64Row_Any_NEON(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + const uint16_t* a_buf, + uint16_t* dst_ptr, + int depth, + int width); +void MergeARGB16To8Row_Any_NEON(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + const uint16_t* a_buf, + uint8_t* dst_ptr, + int depth, + int width); +void MergeXR64Row_Any_NEON(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + uint16_t* dst_ptr, + int depth, + int width); +void MergeXRGB16To8Row_Any_NEON(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + uint8_t* dst_ptr, + int depth, + 
int width); + void MergeUVRow_16_C(const uint16_t* src_u, const uint16_t* src_v, uint16_t* dst_uv, @@ -2024,10 +2215,10 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, int depth, int width); void MergeUVRow_16_Any_AVX2(const uint16_t* src_u, - const uint16_t* src_v, - uint16_t* dst_uv, - int depth, - int width); + const uint16_t* src_v, + uint16_t* dst_uv, + int depth, + int width); void MergeUVRow_16_NEON(const uint16_t* src_u, const uint16_t* src_v, uint16_t* dst_uv, @@ -2073,16 +2264,16 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y, uint16_t* dst_y, int scale, int width); -void MultiplyRow_16_Any_AVX2(const uint16_t* src_y, - uint16_t* dst_y, +void MultiplyRow_16_Any_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, int scale, int width); void MultiplyRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width); -void MultiplyRow_16_Any_NEON(const uint16_t* src_y, - uint16_t* dst_y, +void MultiplyRow_16_Any_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, int scale, int width); @@ -2094,16 +2285,16 @@ void DivideRow_16_AVX2(const uint16_t* src_y, uint16_t* dst_y, int scale, int width); -void DivideRow_16_Any_AVX2(const uint16_t* src_y, - uint16_t* dst_y, +void DivideRow_16_Any_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, int scale, int width); void DivideRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width); -void DivideRow_16_Any_NEON(const uint16_t* src_y, - uint16_t* dst_y, +void DivideRow_16_Any_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, int scale, int width); @@ -2527,6 +2718,71 @@ void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width); void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width); +void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width); +void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width); +void AR64ToARGBRow_C(const 
uint16_t* src_ar64, uint8_t* dst_argb, int width); +void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width); +void AR64ShuffleRow_C(const uint8_t* src_ar64, + uint8_t* dst_ar64, + const uint8_t* shuffler, + int width); +void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width); +void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, + uint16_t* dst_ab64, + int width); +void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, + uint8_t* dst_argb, + int width); +void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, + uint8_t* dst_argb, + int width); +void ARGBToAR64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ar64, int width); +void ARGBToAB64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ab64, int width); +void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, uint8_t* dst_argb, int width); +void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, uint8_t* dst_argb, int width); +void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width); +void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width); +void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width); +void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, int width); +void ARGBToAR64Row_Any_SSSE3(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int width); +void ARGBToAB64Row_Any_SSSE3(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int width); +void AR64ToARGBRow_Any_SSSE3(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int width); +void AB64ToARGBRow_Any_SSSE3(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToAR64Row_Any_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int width); +void ARGBToAB64Row_Any_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int width); +void AR64ToARGBRow_Any_AVX2(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int width); +void AB64ToARGBRow_Any_AVX2(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToAR64Row_Any_NEON(const uint8_t* 
src_ptr, + uint16_t* dst_ptr, + int width); +void ARGBToAB64Row_Any_NEON(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int width); +void AR64ToARGBRow_Any_NEON(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int width); +void AB64ToARGBRow_Any_NEON(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int width); + void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width); @@ -2575,6 +2831,44 @@ void I210ToARGBRow_C(const uint16_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); +void I212ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I212ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I410ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I410ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I210AlphaToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + const uint16_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I410AlphaToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + const uint16_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); void I444AlphaToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -2626,6 +2920,27 @@ void UYVYToARGBRow_C(const uint8_t* src_uyvy, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int 
width); +void P210ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P410ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P210ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P410ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); + void I422ToRGBARow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -2705,6 +3020,44 @@ void I210ToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I212ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void I212ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I410ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void I410ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422ToAR30Row_AVX2(const uint8_t* 
y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2723,6 +3076,44 @@ void I210ToAR30Row_AVX2(const uint16_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width); +void I212ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I212ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void I410ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void I410ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I210AlphaToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I410AlphaToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2821,6 +3212,48 @@ void UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); + +void P210ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P410ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P210ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + 
int width); +void P410ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P210ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P410ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P210ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P410ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); + void I422ToRGBARow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2923,6 +3356,44 @@ void I210ToARGBRow_Any_SSSE3(const uint16_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I212ToAR30Row_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I212ToARGBRow_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I410ToAR30Row_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I410ToARGBRow_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I210AlphaToARGBRow_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I410AlphaToARGBRow_Any_SSSE3(const uint16_t* y_buf, 
+ const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToAR30Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2941,6 +3412,44 @@ void I210ToAR30Row_Any_AVX2(const uint16_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I212ToARGBRow_Any_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I212ToAR30Row_Any_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I410ToAR30Row_Any_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I410ToARGBRow_Any_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I210AlphaToARGBRow_Any_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I410AlphaToARGBRow_Any_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I444AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -3039,6 +3548,46 @@ void UYVYToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void P210ToARGBRow_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void P410ToARGBRow_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* 
uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void P210ToAR30Row_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void P410ToAR30Row_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void P210ToARGBRow_Any_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void P410ToARGBRow_Any_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void P210ToAR30Row_Any_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void P410ToAR30Row_Any_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGBARow_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -3120,15 +3669,15 @@ void I400ToARGBRow_MMI(const uint8_t* src_y, int width); void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, + const struct YuvConstants* param, int width); void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, + const struct YuvConstants* param, int width); void I400ToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, + const struct YuvConstants* param, int width); void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -3140,11 +3689,11 @@ void I400ToARGBRow_Any_MMI(const uint8_t* src_ptr, int width); // ARGB preattenuated alpha blend. 
-void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, +void ARGBBlendRow_SSSE3(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); -void ARGBBlendRow_NEON(const uint8_t* src_argb0, +void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3156,7 +3705,7 @@ void ARGBBlendRow_MMI(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width); -void ARGBBlendRow_C(const uint8_t* src_argb0, +void ARGBBlendRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3200,11 +3749,11 @@ void BlendPlaneRow_C(const uint8_t* src0, // ARGB multiply images. Same API as Blend, but these require // pointer and width alignment for SSE2. -void ARGBMultiplyRow_C(const uint8_t* src_argb0, +void ARGBMultiplyRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); -void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, +void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3212,7 +3761,7 @@ void ARGBMultiplyRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, +void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3220,7 +3769,7 @@ void ARGBMultiplyRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, +void ARGBMultiplyRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3246,11 +3795,11 @@ void ARGBMultiplyRow_Any_MMI(const uint8_t* y_buf, int width); // ARGB add images. 
-void ARGBAddRow_C(const uint8_t* src_argb0, +void ARGBAddRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); -void ARGBAddRow_SSE2(const uint8_t* src_argb0, +void ARGBAddRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3258,7 +3807,7 @@ void ARGBAddRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void ARGBAddRow_AVX2(const uint8_t* src_argb0, +void ARGBAddRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3266,7 +3815,7 @@ void ARGBAddRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void ARGBAddRow_NEON(const uint8_t* src_argb0, +void ARGBAddRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3293,11 +3842,11 @@ void ARGBAddRow_Any_MMI(const uint8_t* y_buf, // ARGB subtract images. Same API as Blend, but these require // pointer and width alignment for SSE2. 
-void ARGBSubtractRow_C(const uint8_t* src_argb0, +void ARGBSubtractRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); -void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, +void ARGBSubtractRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3305,7 +3854,7 @@ void ARGBSubtractRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, +void ARGBSubtractRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3313,7 +3862,7 @@ void ARGBSubtractRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void ARGBSubtractRow_NEON(const uint8_t* src_argb0, +void ARGBSubtractRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3520,9 +4069,9 @@ void NV21ToRGB24Row_Any_NEON(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV21ToYUV24Row_Any_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, +void NV21ToYUV24Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, int width); void NV12ToRGB565Row_Any_NEON(const uint8_t* y_buf, const uint8_t* uv_buf, @@ -3537,6 +4086,46 @@ void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void P210ToARGBRow_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P410ToARGBRow_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P210ToAR30Row_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P410ToAR30Row_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, 
+ uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P210ToARGBRow_Any_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P410ToARGBRow_Any_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P210ToAR30Row_Any_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P410ToAR30Row_Any_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); void I444ToARGBRow_Any_MSA(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -3684,7 +4273,7 @@ void YUY2ToUV422Row_C(const uint8_t* src_yuy2, int width); void YUY2ToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToUVRow_Any_AVX2(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -3694,7 +4283,7 @@ void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr, int width); void YUY2ToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -3704,7 +4293,7 @@ void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr, int width); void YUY2ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -3805,7 +4394,7 @@ void UYVYToUV422Row_C(const uint8_t* src_uyvy, int width); void UYVYToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void UYVYToUVRow_Any_AVX2(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -3815,7 
+4404,7 @@ void UYVYToUV422Row_Any_AVX2(const uint8_t* src_ptr, int width); void UYVYToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void UYVYToUVRow_Any_SSE2(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -3825,7 +4414,7 @@ void UYVYToUV422Row_Any_SSE2(const uint8_t* src_ptr, int width); void UYVYToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void UYVYToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -3862,29 +4451,29 @@ void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width); void SwapUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width); void AYUVToUVRow_C(const uint8_t* src_ayuv, - int stride_ayuv, + int src_stride_ayuv, uint8_t* dst_uv, int width); void AYUVToVURow_C(const uint8_t* src_ayuv, - int stride_ayuv, + int src_stride_ayuv, uint8_t* dst_vu, int width); void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width); void AYUVToUVRow_NEON(const uint8_t* src_ayuv, - int stride_ayuv, + int src_stride_ayuv, uint8_t* dst_uv, int width); void AYUVToVURow_NEON(const uint8_t* src_ayuv, - int stride_ayuv, + int src_stride_ayuv, uint8_t* dst_vu, int width); -void AYUVToYRow_Any_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width); -void AYUVToUVRow_Any_NEON(const uint8_t* src_ayuv, - int stride_ayuv, - uint8_t* dst_uv, +void AYUVToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void AYUVToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_vu, int width); -void AYUVToVURow_Any_NEON(const uint8_t* src_ayuv, - int stride_ayuv, +void AYUVToVURow_Any_NEON(const uint8_t* src_ptr, + int src_stride, uint8_t* dst_vu, int width); diff --git a/third_party/libyuv/include/libyuv/scale_row.h b/third_party/libyuv/include/libyuv/scale_row.h 
index 18ffb546a3..461ac36f33 100644 --- a/third_party/libyuv/include/libyuv/scale_row.h +++ b/third_party/libyuv/include/libyuv/scale_row.h @@ -74,15 +74,16 @@ extern "C" { // The following are available for gcc/clang x86 platforms: // TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) #define HAS_SCALEUVROWDOWN2BOX_SSSE3 #define HAS_SCALEROWUP2LINEAR_SSE2 #define HAS_SCALEROWUP2LINEAR_SSSE3 #define HAS_SCALEROWUP2BILINEAR_SSE2 #define HAS_SCALEROWUP2BILINEAR_SSSE3 -#define HAS_SCALEROWUP2LINEAR_16_SSSE3 -#define HAS_SCALEROWUP2BILINEAR_16_SSSE3 +#define HAS_SCALEROWUP2LINEAR_12_SSSE3 +#define HAS_SCALEROWUP2BILINEAR_12_SSSE3 +#define HAS_SCALEROWUP2LINEAR_16_SSE2 +#define HAS_SCALEROWUP2BILINEAR_16_SSE2 #define HAS_SCALEUVROWUP2LINEAR_SSSE3 #define HAS_SCALEUVROWUP2BILINEAR_SSSE3 #define HAS_SCALEUVROWUP2LINEAR_16_SSE2 @@ -92,12 +93,14 @@ extern "C" { // The following are available for gcc/clang x86 platforms, but // require clang 3.4 or gcc 4.7. 
// TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) && !defined(_MSC_VER) && \ +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || defined(__i386__)) && \ (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) #define HAS_SCALEUVROWDOWN2BOX_AVX2 #define HAS_SCALEROWUP2LINEAR_AVX2 #define HAS_SCALEROWUP2BILINEAR_AVX2 +#define HAS_SCALEROWUP2LINEAR_12_AVX2 +#define HAS_SCALEROWUP2BILINEAR_12_AVX2 #define HAS_SCALEROWUP2LINEAR_16_AVX2 #define HAS_SCALEROWUP2BILINEAR_16_AVX2 #define HAS_SCALEUVROWUP2LINEAR_AVX2 @@ -134,6 +137,8 @@ extern "C" { #define HAS_SCALEUVROWDOWNEVEN_NEON #define HAS_SCALEROWUP2LINEAR_NEON #define HAS_SCALEROWUP2BILINEAR_NEON +#define HAS_SCALEROWUP2LINEAR_12_NEON +#define HAS_SCALEROWUP2BILINEAR_12_NEON #define HAS_SCALEROWUP2LINEAR_16_NEON #define HAS_SCALEROWUP2BILINEAR_16_NEON #define HAS_SCALEUVROWUP2LINEAR_NEON @@ -611,14 +616,22 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); -void ScaleRowUp2_Linear_16_SSSE3(const uint16_t* src_ptr, +void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); -void ScaleRowUp2_Bilinear_16_SSSE3(const uint16_t* src_ptr, +void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); +void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); @@ -635,6 +648,14 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); +void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void 
ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); @@ -651,9 +672,17 @@ void ScaleRowUp2_Bilinear_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); -void ScaleRowUp2_Linear_16_Any_SSSE3(const uint16_t* src_ptr, +void ScaleRowUp2_Linear_12_Any_SSSE3(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); +void ScaleRowUp2_Bilinear_12_Any_SSSE3(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); void ScaleRowUp2_Bilinear_16_Any_SSSE3(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, @@ -675,6 +704,14 @@ void ScaleRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); +void ScaleRowUp2_Linear_12_Any_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_12_Any_AVX2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); void ScaleRowUp2_Linear_16_Any_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); @@ -1424,6 +1461,14 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); +void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); @@ -1440,6 +1485,14 @@ void ScaleRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); +void 
ScaleRowUp2_Linear_12_Any_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_12_Any_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); void ScaleRowUp2_Linear_16_Any_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); diff --git a/third_party/libyuv/include/libyuv/version.h b/third_party/libyuv/include/libyuv/version.h index e59b316a60..f713c47704 100644 --- a/third_party/libyuv/include/libyuv/version.h +++ b/third_party/libyuv/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1779 +#define LIBYUV_VERSION 1788 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/third_party/libyuv/include/libyuv/video_common.h b/third_party/libyuv/include/libyuv/video_common.h index 0da3fb5544..32b8a5210b 100644 --- a/third_party/libyuv/include/libyuv/video_common.h +++ b/third_party/libyuv/include/libyuv/video_common.h @@ -65,12 +65,14 @@ enum FourCC { // 1 Secondary YUV format: row biplanar. deprecated. FOURCC_M420 = FOURCC('M', '4', '2', '0'), - // 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc + // 13 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc 2 64 bpp FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), FOURCC_AR30 = FOURCC('A', 'R', '3', '0'), // 10 bit per channel. 2101010. FOURCC_AB30 = FOURCC('A', 'B', '3', '0'), // ABGR version of 10 bit + FOURCC_AR64 = FOURCC('A', 'R', '6', '4'), // 16 bit per channel. 
+ FOURCC_AB64 = FOURCC('A', 'B', '6', '4'), // ABGR version of 16 bit FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'), @@ -163,6 +165,8 @@ enum FourCCBpp { FOURCC_BPP_RGBA = 32, FOURCC_BPP_AR30 = 32, FOURCC_BPP_AB30 = 32, + FOURCC_BPP_AR64 = 64, + FOURCC_BPP_AB64 = 64, FOURCC_BPP_24BG = 24, FOURCC_BPP_RAW = 24, FOURCC_BPP_RGBP = 16, diff --git a/third_party/libyuv/source/compare_common.cc b/third_party/libyuv/source/compare_common.cc index d4b170ad98..d1cab8d2b4 100644 --- a/third_party/libyuv/source/compare_common.cc +++ b/third_party/libyuv/source/compare_common.cc @@ -17,36 +17,6 @@ namespace libyuv { extern "C" { #endif -#if ORIGINAL_OPT -uint32_t HammingDistance_C1(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff = 0u; - - int i; - for (i = 0; i < count; ++i) { - int x = src_a[i] ^ src_b[i]; - if (x & 1) - ++diff; - if (x & 2) - ++diff; - if (x & 4) - ++diff; - if (x & 8) - ++diff; - if (x & 16) - ++diff; - if (x & 32) - ++diff; - if (x & 64) - ++diff; - if (x & 128) - ++diff; - } - return diff; -} -#endif - // Hakmem method for hamming distance. uint32_t HammingDistance_C(const uint8_t* src_a, const uint8_t* src_b, diff --git a/third_party/libyuv/source/compare_gcc.cc b/third_party/libyuv/source/compare_gcc.cc index 6700f9697e..b834b42ac4 100644 --- a/third_party/libyuv/source/compare_gcc.cc +++ b/third_party/libyuv/source/compare_gcc.cc @@ -19,8 +19,7 @@ extern "C" { #endif // This module is for GCC x86 and x64. 
-#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) #if defined(__x86_64__) uint32_t HammingDistance_SSE42(const uint8_t* src_a, diff --git a/third_party/libyuv/source/compare_win.cc b/third_party/libyuv/source/compare_win.cc index d57d3d9d1c..9bb27f1dd1 100644 --- a/third_party/libyuv/source/compare_win.cc +++ b/third_party/libyuv/source/compare_win.cc @@ -22,8 +22,9 @@ namespace libyuv { extern "C" { #endif -// This module is for 32 bit Visual C x86 and clangcl -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) +// This module is for 32 bit Visual C x86 +#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ + !defined(__clang__) && defined(_M_IX86) uint32_t HammingDistance_SSE42(const uint8_t* src_a, const uint8_t* src_b, @@ -77,8 +78,7 @@ __declspec(naked) uint32_t } } -// Visual C 2012 required for AVX2. -#if _MSC_VER >= 1700 +#ifdef HAS_SUMSQUAREERROR_AVX2 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. #pragma warning(disable : 4752) __declspec(naked) uint32_t @@ -118,7 +118,7 @@ __declspec(naked) uint32_t ret } } -#endif // _MSC_VER >= 1700 +#endif // HAS_SUMSQUAREERROR_AVX2 uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 uvec32 kHashMul0 = { @@ -196,7 +196,7 @@ __declspec(naked) uint32_t } // Visual C 2012 required for AVX2. 
-#if _MSC_VER >= 1700 +#ifdef HAS_HASHDJB2_AVX2 __declspec(naked) uint32_t HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) { __asm { @@ -231,7 +231,7 @@ __declspec(naked) uint32_t ret } } -#endif // _MSC_VER >= 1700 +#endif // HAS_HASHDJB2_AVX2 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) diff --git a/third_party/libyuv/source/convert.cc b/third_party/libyuv/source/convert.cc index 1bd596599b..69f7fb6e01 100644 --- a/third_party/libyuv/source/convert.cc +++ b/third_party/libyuv/source/convert.cc @@ -400,7 +400,7 @@ int I210ToI010(const uint16_t* src_y, } // Any I[420]1[02] to P[420]1[02] format with mirroring. -static int Ix1xToPx1x(const uint16_t* src_y, +static int IxxxToPxxx(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, @@ -441,7 +441,7 @@ int I010ToP010(const uint16_t* src_y, int dst_stride_uv, int width, int height) { - return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v, + return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, width, height, 1, 1, 10); } @@ -459,7 +459,7 @@ int I210ToP210(const uint16_t* src_y, int dst_stride_uv, int width, int height) { - return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v, + return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, width, height, 1, 0, 10); } @@ -477,7 +477,7 @@ int I012ToP012(const uint16_t* src_y, int dst_stride_uv, int width, int height) { - return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v, + return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, width, height, 1, 1, 12); } @@ -495,7 +495,7 @@ int I212ToP212(const uint16_t* src_y, int dst_stride_uv, int width, int height) { - return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v, + return IxxxToPxxx(src_y, src_stride_y, src_u, 
src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, width, height, 1, 0, 12); } @@ -1368,6 +1368,18 @@ int ARGBToI420(const uint8_t* src_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } +#if defined(HAS_ARGBTOYROW_NEON) && defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; @@ -1388,22 +1400,6 @@ int ARGBToI420(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } - } -#endif #if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToYRow = ARGBToYRow_Any_MMI; @@ -1771,7 +1767,7 @@ int RGB24ToI420(const uint8_t* src_rgb24, } // Neon version does direct RGB24 to YUV. -#if defined(HAS_RGB24TOYROW_NEON) +#if defined(HAS_RGB24TOYROW_NEON) && defined(HAS_RGB24TOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGB24ToUVRow = RGB24ToUVRow_Any_NEON; RGB24ToYRow = RGB24ToYRow_Any_NEON; @@ -1808,6 +1804,14 @@ int RGB24ToI420(const uint8_t* src_rgb24, #endif // Other platforms do intermediate conversion from RGB24 to ARGB. 
#else +#if defined(HAS_RGB24TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB24ToARGBRow = RGB24ToARGBRow_NEON; + } + } +#endif #if defined(HAS_RGB24TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; @@ -1816,6 +1820,18 @@ int RGB24ToI420(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_ARGBTOYROW_NEON) && defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; @@ -1960,6 +1976,14 @@ int RGB24ToJ420(const uint8_t* src_rgb24, } #endif #else +#if defined(HAS_RGB24TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB24ToARGBRow = RGB24ToARGBRow_NEON; + } + } +#endif #if defined(HAS_RGB24TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; @@ -1968,6 +1992,18 @@ int RGB24ToJ420(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_ARGBTOYJROW_NEON) && defined(HAS_ARGBTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_NEON; + } + } + } +#endif #if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; @@ -2111,6 +2147,26 @@ int RAWToI420(const uint8_t* src_raw, #endif // Other platforms do intermediate conversion from RAW to ARGB. 
#else +#if defined(HAS_RAWTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToARGBRow = RAWToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToARGBRow = RAWToARGBRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) && defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } + } +#endif #if defined(HAS_RAWTOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RAWToARGBRow = RAWToARGBRow_Any_SSSE3; @@ -2186,6 +2242,178 @@ int RAWToI420(const uint8_t* src_raw, return 0; } +// TODO(fbarchard): Use Matrix version to implement I420 and J420. +// Convert RAW to J420. +LIBYUV_API +int RAWToJ420(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; +#if (defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \ + defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI) + void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RAWToUVJRow_C; + void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = + RAWToYJRow_C; +#else + void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = + RAWToARGBRow_C; + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYJRow_C; +#endif + if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + +// Neon version does direct RAW to YUV. +#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToUVJRow = RAWToUVJRow_Any_NEON; + RAWToYJRow = RAWToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToYJRow = RAWToYJRow_NEON; + if (IS_ALIGNED(width, 16)) { + RAWToUVJRow = RAWToUVJRow_NEON; + } + } + } +// MMI and MSA version does direct RAW to YUV. +#elif (defined(HAS_RAWTOYJROW_MMI) || defined(HAS_RAWTOYJROW_MSA)) +#if defined(HAS_RAWTOYJROW_MMI) && defined(HAS_RAWTOUVJROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RAWToUVJRow = RAWToUVJRow_Any_MMI; + RAWToYJRow = RAWToYJRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + RAWToYJRow = RAWToYJRow_MMI; + if (IS_ALIGNED(width, 16)) { + RAWToUVJRow = RAWToUVJRow_MMI; + } + } + } +#endif +#if defined(HAS_RAWTOYJROW_MSA) && defined(HAS_RAWTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToUVJRow = RAWToUVJRow_Any_MSA; + RAWToYJRow = RAWToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToYJRow = RAWToYJRow_MSA; + RAWToUVJRow = RAWToUVJRow_MSA; + } + } +#endif +#else +#if defined(HAS_RAWTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToARGBRow = RAWToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToARGBRow = RAWToARGBRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) && defined(HAS_ARGBTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_NEON; + } + } + } +#endif +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_SSSE3) && 
defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_AVX2; + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#endif + + { +#if !((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \ + defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI)) + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if ((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \ + defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI)) + RAWToUVJRow(src_raw, src_stride_raw, dst_u, dst_v, width); + RAWToYJRow(src_raw, dst_y, width); + RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); +#else + RAWToARGBRow(src_raw, row, width); + RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); + ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYJRow(row, dst_y, width); + ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_raw += src_stride_raw * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if ((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \ + defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI)) + RAWToUVJRow(src_raw, 0, dst_u, dst_v, width); + RAWToYJRow(src_raw, dst_y, width); +#else + RAWToARGBRow(src_raw, row, width); + ARGBToUVJRow(row, 0, dst_u, dst_v, width); + ARGBToYJRow(row, dst_y, width); +#endif + } +#if !((defined(HAS_RAWTOYJROW_NEON) && 
defined(HAS_RAWTOUVJROW_NEON)) || \ + defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI)) + free_aligned_buffer_64(row); +#endif + } + return 0; +} + // Convert RGB565 to I420. LIBYUV_API int RGB565ToI420(const uint8_t* src_rgb565, diff --git a/third_party/libyuv/source/convert_argb.cc b/third_party/libyuv/source/convert_argb.cc index 87d7d73250..d8f7b27738 100644 --- a/third_party/libyuv/source/convert_argb.cc +++ b/third_party/libyuv/source/convert_argb.cc @@ -888,6 +888,63 @@ int U010ToAB30(const uint16_t* src_y, &kYuv2020Constants, width, height); } +// Convert 12 bit YUV to ARGB with matrix. +// TODO(fbarchard): Consider passing scale multiplier to I212ToARGB to +// multiply 12 bit yuv into high bits to allow any number of bits. +LIBYUV_API +int I012ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I212ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I212ToAR30Row_C; + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_I212TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I212ToAR30Row = I212ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I212ToAR30Row = I212ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I212TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I212ToAR30Row = I212ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I212ToAR30Row = I212ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I212ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + // Convert 10 bit YUV to ARGB with matrix. // TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to // multiply 10 bit yuv into high bits to allow any number of bits. @@ -1045,6 +1102,58 @@ int U210ToAB30(const uint16_t* src_y, &kYuv2020Constants, width, height); } +LIBYUV_API +int I410ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I410ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I410ToAR30Row_C; + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_I410TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410ToAR30Row = I410ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410ToAR30Row = I410ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I410TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410ToAR30Row = I410ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410ToAR30Row = I410ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I410ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + // Convert 10 bit YUV to ARGB with matrix. LIBYUV_API int I010ToARGBMatrix(const uint16_t* src_y, @@ -1088,14 +1197,6 @@ int I010ToARGBMatrix(const uint16_t* src_y, } } #endif -#if defined(HAS_I210TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I210ToARGBRow = I210ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I210ToARGBRow = I210ToARGBRow_MMI; - } - } -#endif for (y = 0; y < height; ++y) { I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; @@ -1216,6 +1317,61 @@ int U010ToABGR(const uint16_t* src_y, width, height); } +// Convert 12 bit YUV to ARGB with matrix. 
+LIBYUV_API +int I012ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I212ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I212ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I212TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I212ToARGBRow = I212ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I212ToARGBRow = I212ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I212TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I212ToARGBRow = I212ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I212ToARGBRow = I212ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I212ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + // Convert 10 bit 422 YUV to ARGB with matrix. 
LIBYUV_API int I210ToARGBMatrix(const uint16_t* src_y, @@ -1259,14 +1415,6 @@ int I210ToARGBMatrix(const uint16_t* src_y, } } #endif -#if defined(HAS_I210TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I210ToARGBRow = I210ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I210ToARGBRow = I210ToARGBRow_MMI; - } - } -#endif for (y = 0; y < height; ++y) { I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; @@ -1385,6 +1533,254 @@ int U210ToABGR(const uint16_t* src_y, width, height); } +LIBYUV_API +int I410ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I410ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I410TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410ToARGBRow = I410ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410ToARGBRow = I410ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I410TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410ToARGBRow = I410ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410ToARGBRow = I410ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I410ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +LIBYUV_API +int P010ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P210ToARGBRow)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_P210TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P210ToARGBRow = P210ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P210ToARGBRow = P210ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_P210TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P210ToARGBRow = P210ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P210ToARGBRow = P210ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +LIBYUV_API +int P210ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P210ToARGBRow)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_P210TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P210ToARGBRow = P210ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P210ToARGBRow = P210ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_P210TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P210ToARGBRow = P210ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P210ToARGBRow = P210ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_uv += src_stride_uv; + } + return 0; +} + +LIBYUV_API +int P010ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P210ToAR30Row)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C; + if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_P210TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P210ToAR30Row = P210ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P210ToAR30Row = P210ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_P210TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P210ToAR30Row = P210ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P210ToAR30Row = P210ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +LIBYUV_API +int P210ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P210ToAR30Row)( + const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C; + if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_P210TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P210ToAR30Row = P210ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P210ToAR30Row = P210ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_P210TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P210ToAR30Row = P210ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P210ToAR30Row = P210ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + src_uv += src_stride_uv; + } + return 0; +} + // Convert I420 with Alpha to preattenuated ARGB with matrix. LIBYUV_API int I420AlphaToARGBMatrix(const uint8_t* src_y, @@ -1903,6 +2299,323 @@ int I444AlphaToABGR(const uint8_t* src_y, width, height, attenuate); } +// Convert I010 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I010AlphaToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I210AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I210AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I210ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I210ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBAttenuateRow = ARGBAttenuateRow_MMI; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} 
+ +// Convert I210 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I210AlphaToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I210AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I210AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I210ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I210ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + 
ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBAttenuateRow = ARGBAttenuateRow_MMI; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I410 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I410AlphaToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I410AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I410AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I410ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I410ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBAttenuateRow = ARGBAttenuateRow_MMI; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I410AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + // Convert I400 
to ARGB with matrix. LIBYUV_API int I400ToARGBMatrix(const uint8_t* src_y, @@ -2078,6 +2791,10 @@ static const uvec8 kShuffleMaskABGRToARGB = { static const uvec8 kShuffleMaskRGBAToARGB = { 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u}; +// Shuffle table for converting AR64 to AB64. +static const uvec8 kShuffleMaskAR64ToAB64 = { + 4u, 5u, 2u, 3u, 0u, 1u, 6u, 7u, 12u, 13u, 10u, 11u, 8u, 9u, 14u, 15u}; + // Convert BGRA to ARGB. LIBYUV_API int BGRAToARGB(const uint8_t* src_bgra, @@ -2087,7 +2804,7 @@ int BGRAToARGB(const uint8_t* src_bgra, int width, int height) { return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, - (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height); + (const uint8_t*)&kShuffleMaskBGRAToARGB, width, height); } // Convert ARGB to BGRA (same as BGRAToARGB). @@ -2099,7 +2816,7 @@ int ARGBToBGRA(const uint8_t* src_bgra, int width, int height) { return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, - (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height); + (const uint8_t*)&kShuffleMaskBGRAToARGB, width, height); } // Convert ABGR to ARGB. @@ -2111,7 +2828,7 @@ int ABGRToARGB(const uint8_t* src_abgr, int width, int height) { return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, - (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height); + (const uint8_t*)&kShuffleMaskABGRToARGB, width, height); } // Convert ARGB to ABGR to (same as ABGRToARGB). @@ -2123,7 +2840,7 @@ int ARGBToABGR(const uint8_t* src_abgr, int width, int height) { return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, - (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height); + (const uint8_t*)&kShuffleMaskABGRToARGB, width, height); } // Convert RGBA to ARGB. 
@@ -2135,7 +2852,19 @@ int RGBAToARGB(const uint8_t* src_rgba, int width, int height) { return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb, - (const uint8_t*)(&kShuffleMaskRGBAToARGB), width, height); + (const uint8_t*)&kShuffleMaskRGBAToARGB, width, height); +} + +// Convert AR64 To AB64. +LIBYUV_API +int AR64ToAB64(const uint16_t* src_ar64, + int src_stride_ar64, + uint16_t* dst_ab64, + int dst_stride_ab64, + int width, + int height) { + return AR64Shuffle(src_ar64, src_stride_ar64, dst_ab64, dst_stride_ab64, + (const uint8_t*)&kShuffleMaskAR64ToAB64, width, height); } // Convert RGB24 to ARGB. @@ -2644,6 +3373,124 @@ int AR30ToAB30(const uint8_t* src_ar30, return 0; } +// Convert AR64 to ARGB. +LIBYUV_API +int AR64ToARGB(const uint16_t* src_ar64, + int src_stride_ar64, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*AR64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb, + int width) = AR64ToARGBRow_C; + if (!src_ar64 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar64 = src_ar64 + (height - 1) * src_stride_ar64; + src_stride_ar64 = -src_stride_ar64; + } + // Coalesce rows. 
+ if (src_stride_ar64 == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_ar64 = dst_stride_argb = 0; + } +#if defined(HAS_AR64TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + AR64ToARGBRow = AR64ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + AR64ToARGBRow = AR64ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_AR64TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + AR64ToARGBRow = AR64ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + AR64ToARGBRow = AR64ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_AR64TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + AR64ToARGBRow = AR64ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + AR64ToARGBRow = AR64ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + AR64ToARGBRow(src_ar64, dst_argb, width); + src_ar64 += src_stride_ar64; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert AB64 to ARGB. +LIBYUV_API +int AB64ToARGB(const uint16_t* src_ab64, + int src_stride_ab64, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*AB64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb, + int width) = AB64ToARGBRow_C; + if (!src_ab64 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ab64 = src_ab64 + (height - 1) * src_stride_ab64; + src_stride_ab64 = -src_stride_ab64; + } + // Coalesce rows. 
+ if (src_stride_ab64 == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_ab64 = dst_stride_argb = 0; + } +#if defined(HAS_AB64TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + AB64ToARGBRow = AB64ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + AB64ToARGBRow = AB64ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_AB64TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + AB64ToARGBRow = AB64ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + AB64ToARGBRow = AB64ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_AB64TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + AB64ToARGBRow = AB64ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + AB64ToARGBRow = AB64ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + AB64ToARGBRow(src_ab64, dst_argb, width); + src_ab64 += src_stride_ab64; + dst_argb += dst_stride_argb; + } + return 0; +} + // Convert NV12 to ARGB with matrix. LIBYUV_API int NV12ToARGBMatrix(const uint8_t* src_y, @@ -4463,6 +5310,40 @@ int H420ToAR30(const uint8_t* src_y, &kYvuH709Constants, width, height); } +// Convert I420 to AB30. +LIBYUV_API +int I420ToAB30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I420ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuI601Constants, width, height); +} + +// Convert H420 to AB30. 
+LIBYUV_API +int H420ToAB30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I420ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuH709Constants, width, height); +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/third_party/libyuv/source/convert_from_argb.cc b/third_party/libyuv/source/convert_from_argb.cc index 4ba4bb5e0f..e14615847d 100644 --- a/third_party/libyuv/source/convert_from_argb.cc +++ b/third_party/libyuv/source/convert_from_argb.cc @@ -2009,6 +2009,124 @@ int ARGBToJ422(const uint8_t* src_argb, return 0; } +// Convert ARGB to AR64. +LIBYUV_API +int ARGBToAR64(const uint8_t* src_argb, + int src_stride_argb, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height) { + int y; + void (*ARGBToAR64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, + int width) = ARGBToAR64Row_C; + if (!src_argb || !dst_ar64 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && dst_stride_ar64 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ar64 = 0; + } +#if defined(HAS_ARGBTOAR64ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAR64Row = ARGBToAR64Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAR64Row = ARGBToAR64Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAR64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAR64Row = ARGBToAR64Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAR64Row = ARGBToAR64Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOAR64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToAR64Row = ARGBToAR64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToAR64Row = ARGBToAR64Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToAR64Row(src_argb, dst_ar64, width); + src_argb += src_stride_argb; + dst_ar64 += dst_stride_ar64; + } + return 0; +} + +// Convert ARGB to AB64. +LIBYUV_API +int ARGBToAB64(const uint8_t* src_argb, + int src_stride_argb, + uint16_t* dst_ab64, + int dst_stride_ab64, + int width, + int height) { + int y; + void (*ARGBToAB64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, + int width) = ARGBToAB64Row_C; + if (!src_argb || !dst_ab64 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && dst_stride_ab64 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ab64 = 0; + } +#if defined(HAS_ARGBTOAB64ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAB64Row = ARGBToAB64Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAB64Row = ARGBToAB64Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAB64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAB64Row = ARGBToAB64Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAB64Row = ARGBToAB64Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOAB64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToAB64Row = ARGBToAB64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToAB64Row = ARGBToAB64Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToAB64Row(src_argb, dst_ab64, width); + src_argb += src_stride_argb; + dst_ab64 += dst_stride_ab64; + } + return 0; +} + // Convert ARGB to J400. LIBYUV_API int ARGBToJ400(const uint8_t* src_argb, diff --git a/third_party/libyuv/source/planar_functions.cc b/third_party/libyuv/source/planar_functions.cc index 219c216509..7cea06c8d7 100644 --- a/third_party/libyuv/source/planar_functions.cc +++ b/third_party/libyuv/source/planar_functions.cc @@ -10,6 +10,7 @@ #include "libyuv/planar_functions.h" +#include <assert.h> #include <string.h> // for memset() #include "libyuv/cpu_id.h" @@ -563,9 +564,9 @@ void SplitUVPlane_16(const uint16_t* src_uv, int height, int depth) { int y; - int scale = 1 << depth; - void (*SplitUVRow)(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v, - int scale, int width) = SplitUVRow_16_C; + void (*SplitUVRow_16)(const uint16_t* src_uv, uint16_t* dst_u, + uint16_t* dst_v, int depth, int width) = + SplitUVRow_16_C; // Negative height means invert the image. 
if (height < 0) { height = -height; @@ -583,24 +584,24 @@ void SplitUVPlane_16(const uint16_t* src_uv, } #if defined(HAS_SPLITUVROW_16_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - SplitUVRow = SplitUVRow_16_Any_AVX2; + SplitUVRow_16 = SplitUVRow_16_Any_AVX2; if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_16_AVX2; + SplitUVRow_16 = SplitUVRow_16_AVX2; } } #endif #if defined(HAS_SPLITUVROW_16_NEON) if (TestCpuFlag(kCpuHasNEON)) { - SplitUVRow = SplitUVRow_16_Any_NEON; + SplitUVRow_16 = SplitUVRow_16_Any_NEON; if (IS_ALIGNED(width, 8)) { - SplitUVRow = SplitUVRow_16_NEON; + SplitUVRow_16 = SplitUVRow_16_NEON; } } #endif for (y = 0; y < height; ++y) { // Copy a row of UV. - SplitUVRow(src_uv, dst_u, dst_v, scale, width); + SplitUVRow_16(src_uv, dst_u, dst_v, depth, width); dst_u += dst_stride_u; dst_v += dst_stride_v; src_uv += src_stride_uv; @@ -618,9 +619,11 @@ void MergeUVPlane_16(const uint16_t* src_u, int height, int depth) { int y; - int scale = 1 << (16 - depth); - void (*MergeUVRow)(const uint16_t* src_u, const uint16_t* src_v, - uint16_t* dst_uv, int scale, int width) = MergeUVRow_16_C; + void (*MergeUVRow_16)(const uint16_t* src_u, const uint16_t* src_v, + uint16_t* dst_uv, int depth, int width) = + MergeUVRow_16_C; + assert(depth >= 8); + assert(depth <= 16); // Negative height means invert the image. 
if (height < 0) { height = -height; @@ -636,24 +639,24 @@ void MergeUVPlane_16(const uint16_t* src_u, } #if defined(HAS_MERGEUVROW_16_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - MergeUVRow = MergeUVRow_16_Any_AVX2; + MergeUVRow_16 = MergeUVRow_16_Any_AVX2; if (IS_ALIGNED(width, 16)) { - MergeUVRow = MergeUVRow_16_AVX2; + MergeUVRow_16 = MergeUVRow_16_AVX2; } } #endif #if defined(HAS_MERGEUVROW_16_NEON) if (TestCpuFlag(kCpuHasNEON)) { - MergeUVRow = MergeUVRow_16_Any_NEON; + MergeUVRow_16 = MergeUVRow_16_Any_NEON; if (IS_ALIGNED(width, 8)) { - MergeUVRow = MergeUVRow_16_NEON; + MergeUVRow_16 = MergeUVRow_16_NEON; } } #endif for (y = 0; y < height; ++y) { // Merge a row of U and V into a row of UV. - MergeUVRow(src_u, src_v, dst_uv, scale, width); + MergeUVRow_16(src_u, src_v, dst_uv, depth, width); src_u += src_stride_u; src_v += src_stride_v; dst_uv += dst_stride_uv; @@ -671,8 +674,8 @@ void ConvertToMSBPlane_16(const uint16_t* src_y, int depth) { int y; int scale = 1 << (16 - depth); - void (*MultiplyRow)(const uint16_t* src_y, uint16_t* dst_y, int scale, - int width) = MultiplyRow_16_C; + void (*MultiplyRow_16)(const uint16_t* src_y, uint16_t* dst_y, int scale, + int width) = MultiplyRow_16_C; // Negative height means invert the image. 
if (height < 0) { height = -height; @@ -688,23 +691,23 @@ void ConvertToMSBPlane_16(const uint16_t* src_y, #if defined(HAS_MULTIPLYROW_16_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - MultiplyRow = MultiplyRow_16_Any_AVX2; + MultiplyRow_16 = MultiplyRow_16_Any_AVX2; if (IS_ALIGNED(width, 32)) { - MultiplyRow = MultiplyRow_16_AVX2; + MultiplyRow_16 = MultiplyRow_16_AVX2; } } #endif #if defined(HAS_MULTIPLYROW_16_NEON) if (TestCpuFlag(kCpuHasNEON)) { - MultiplyRow = MultiplyRow_16_Any_NEON; + MultiplyRow_16 = MultiplyRow_16_Any_NEON; if (IS_ALIGNED(width, 16)) { - MultiplyRow = MultiplyRow_16_NEON; + MultiplyRow_16 = MultiplyRow_16_NEON; } } #endif for (y = 0; y < height; ++y) { - MultiplyRow(src_y, dst_y, scale, width); + MultiplyRow_16(src_y, dst_y, scale, width); src_y += src_stride_y; dst_y += dst_stride_y; } @@ -982,6 +985,142 @@ void MergeRGBPlane(const uint8_t* src_r, } } +LIBYUV_NOINLINE +void SplitARGBPlaneAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height) { + int y; + void (*SplitARGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, + uint8_t* dst_b, uint8_t* dst_a, int width) = + SplitARGBRow_C; + + assert(height > 0); + + if (src_stride_argb == width * 4 && dst_stride_r == width && + dst_stride_g == width && dst_stride_b == width && dst_stride_a == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b = + dst_stride_a = 0; + } + +#if defined(HAS_SPLITARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SplitARGBRow = SplitARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + SplitARGBRow = SplitARGBRow_SSE2; + } + } +#endif +#if defined(HAS_SPLITARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + SplitARGBRow = SplitARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + SplitARGBRow = SplitARGBRow_SSSE3; + } + } +#endif +#if 
defined(HAS_SPLITARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitARGBRow = SplitARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + SplitARGBRow = SplitARGBRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitARGBRow = SplitARGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitARGBRow = SplitARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + SplitARGBRow(src_argb, dst_r, dst_g, dst_b, dst_a, width); + dst_r += dst_stride_r; + dst_g += dst_stride_g; + dst_b += dst_stride_b; + dst_a += dst_stride_a; + src_argb += src_stride_argb; + } +} + +LIBYUV_NOINLINE +void SplitARGBPlaneOpaque(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { + int y; + void (*SplitXRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, + uint8_t* dst_b, int width) = SplitXRGBRow_C; + assert(height > 0); + + if (src_stride_argb == width * 4 && dst_stride_r == width && + dst_stride_g == width && dst_stride_b == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b = 0; + } + +#if defined(HAS_SPLITXRGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SplitXRGBRow = SplitXRGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + SplitXRGBRow = SplitXRGBRow_SSE2; + } + } +#endif +#if defined(HAS_SPLITXRGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + SplitXRGBRow = SplitXRGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + SplitXRGBRow = SplitXRGBRow_SSSE3; + } + } +#endif +#if defined(HAS_SPLITXRGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitXRGBRow = SplitXRGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + SplitXRGBRow = SplitXRGBRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITXRGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitXRGBRow = SplitXRGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitXRGBRow = SplitXRGBRow_NEON; + } + } 
+#endif + + for (y = 0; y < height; ++y) { + SplitXRGBRow(src_argb, dst_r, dst_g, dst_b, width); + dst_r += dst_stride_r; + dst_g += dst_stride_g; + dst_b += dst_stride_b; + src_argb += src_stride_argb; + } +} + LIBYUV_API void SplitARGBPlane(const uint8_t* src_argb, int src_stride_argb, @@ -995,137 +1134,146 @@ void SplitARGBPlane(const uint8_t* src_argb, int dst_stride_a, int width, int height) { - int y; - void (*SplitARGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, - uint8_t* dst_b, uint8_t* dst_a, int width) = - SplitARGBRow_C; - void (*SplitXRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, - uint8_t* dst_b, int width) = SplitXRGBRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_r = dst_r + (height - 1) * dst_stride_r; + dst_g = dst_g + (height - 1) * dst_stride_g; + dst_b = dst_b + (height - 1) * dst_stride_b; + dst_a = dst_a + (height - 1) * dst_stride_a; + dst_stride_r = -dst_stride_r; + dst_stride_g = -dst_stride_g; + dst_stride_b = -dst_stride_b; + dst_stride_a = -dst_stride_a; + } if (dst_a == NULL) { - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_r = dst_r + (height - 1) * dst_stride_r; - dst_g = dst_g + (height - 1) * dst_stride_g; - dst_b = dst_b + (height - 1) * dst_stride_b; - dst_stride_r = -dst_stride_r; - dst_stride_g = -dst_stride_g; - dst_stride_b = -dst_stride_b; - } - - // Coalesce rows. 
- if (src_stride_argb == width * 4 && dst_stride_r == width && - dst_stride_g == width && dst_stride_b == width) { - width *= height; - height = 1; - src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b = - dst_stride_a = 0; - } + SplitARGBPlaneOpaque(src_argb, src_stride_argb, dst_r, dst_stride_r, dst_g, + dst_stride_g, dst_b, dst_stride_b, width, height); + } else { + SplitARGBPlaneAlpha(src_argb, src_stride_argb, dst_r, dst_stride_r, dst_g, + dst_stride_g, dst_b, dst_stride_b, dst_a, dst_stride_a, + width, height); + } +} -#if defined(HAS_SPLITARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SplitXRGBRow = SplitXRGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - SplitXRGBRow = SplitXRGBRow_SSE2; - } - } -#endif -#if defined(HAS_SPLITARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - SplitXRGBRow = SplitXRGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - SplitXRGBRow = SplitXRGBRow_SSSE3; - } +LIBYUV_NOINLINE +void MergeARGBPlaneAlpha(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*MergeARGBRow)(const uint8_t* src_r, const uint8_t* src_g, + const uint8_t* src_b, const uint8_t* src_a, + uint8_t* dst_argb, int width) = MergeARGBRow_C; + + assert(height > 0); + + if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + src_stride_a == width && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = src_stride_a = + dst_stride_argb = 0; + } +#if defined(HAS_MERGEARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeARGBRow = MergeARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + MergeARGBRow = MergeARGBRow_SSE2; } + } #endif -#if defined(HAS_SPLITARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - SplitXRGBRow = SplitXRGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - 
SplitXRGBRow = SplitXRGBRow_AVX2; - } +#if defined(HAS_MERGEARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeARGBRow = MergeARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeARGBRow = MergeARGBRow_AVX2; } + } #endif -#if defined(HAS_SPLITRGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SplitXRGBRow = SplitXRGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - SplitXRGBRow = SplitXRGBRow_NEON; - } +#if defined(HAS_MERGEARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeARGBRow = MergeARGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MergeARGBRow = MergeARGBRow_NEON; } + } #endif - for (y = 0; y < height; ++y) { - SplitXRGBRow(src_argb, dst_r, dst_g, dst_b, width); - dst_r += dst_stride_r; - dst_g += dst_stride_g; - dst_b += dst_stride_b; - src_argb += src_stride_argb; - } - } else { - if (height < 0) { - height = -height; - dst_r = dst_r + (height - 1) * dst_stride_r; - dst_g = dst_g + (height - 1) * dst_stride_g; - dst_b = dst_b + (height - 1) * dst_stride_b; - dst_a = dst_a + (height - 1) * dst_stride_a; - dst_stride_r = -dst_stride_r; - dst_stride_g = -dst_stride_g; - dst_stride_b = -dst_stride_b; - dst_stride_a = -dst_stride_a; - } - - if (src_stride_argb == width * 4 && dst_stride_r == width && - dst_stride_g == width && dst_stride_b == width && - dst_stride_a == width) { - width *= height; - height = 1; - src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b = - dst_stride_a = 0; - } + for (y = 0; y < height; ++y) { + MergeARGBRow(src_r, src_g, src_b, src_a, dst_argb, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + src_a += src_stride_a; + dst_argb += dst_stride_argb; + } +} -#if defined(HAS_SPLITARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - SplitARGBRow = SplitARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - SplitARGBRow = SplitARGBRow_SSE2; - } - } -#endif -#if defined(HAS_SPLITARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - SplitARGBRow = SplitARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { 
- SplitARGBRow = SplitARGBRow_SSSE3; - } +LIBYUV_NOINLINE +void MergeARGBPlaneOpaque(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*MergeXRGBRow)(const uint8_t* src_r, const uint8_t* src_g, + const uint8_t* src_b, uint8_t* dst_argb, int width) = + MergeXRGBRow_C; + + assert(height > 0); + + if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0; + } +#if defined(HAS_MERGEXRGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeXRGBRow = MergeXRGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + MergeXRGBRow = MergeXRGBRow_SSE2; } + } #endif -#if defined(HAS_SPLITARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - SplitARGBRow = SplitARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - SplitARGBRow = SplitARGBRow_AVX2; - } +#if defined(HAS_MERGEXRGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeXRGBRow = MergeXRGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeXRGBRow = MergeXRGBRow_AVX2; } + } #endif -#if defined(HAS_SPLITRGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SplitARGBRow = SplitARGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - SplitARGBRow = SplitARGBRow_NEON; - } +#if defined(HAS_MERGEXRGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeXRGBRow = MergeXRGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MergeXRGBRow = MergeXRGBRow_NEON; } + } #endif - for (y = 0; y < height; ++y) { - SplitARGBRow(src_argb, dst_r, dst_g, dst_b, dst_a, width); - dst_r += dst_stride_r; - dst_g += dst_stride_g; - dst_b += dst_stride_b; - dst_a += dst_stride_a; - src_argb += src_stride_argb; - } + for (y = 0; y < height; ++y) { + MergeXRGBRow(src_r, src_g, src_b, dst_argb, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += 
src_stride_b; + dst_argb += dst_stride_argb; } } @@ -1142,107 +1290,357 @@ void MergeARGBPlane(const uint8_t* src_r, int dst_stride_argb, int width, int height) { - int y; - void (*MergeARGBRow)(const uint8_t* src_r, const uint8_t* src_g, - const uint8_t* src_b, const uint8_t* src_a, - uint8_t* dst_argb, int width) = MergeARGBRow_C; - void (*MergeXRGBRow)(const uint8_t* src_r, const uint8_t* src_g, - const uint8_t* src_b, uint8_t* dst_argb, int width) = - MergeXRGBRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } if (src_a == NULL) { - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - // Coalesce rows. - if (src_stride_r == width && src_stride_g == width && - src_stride_b == width && dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0; + MergeARGBPlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b, + src_stride_b, dst_argb, dst_stride_argb, width, + height); + } else { + MergeARGBPlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b, + src_stride_b, src_a, src_stride_a, dst_argb, + dst_stride_argb, width, height); + } +} + +// TODO(yuan): Support 2 bit alpha channel. +LIBYUV_API +void MergeXR30Plane(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height, + int depth) { + int y; + void (*MergeXR30Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, uint8_t* dst_ar30, int depth, + int width) = MergeXR30Row_C; + + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } + // Coalesce rows. + if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + dst_stride_ar30 == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_ar30 = 0; + } +#if defined(HAS_MERGEXR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeXR30Row = MergeXR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeXR30Row = MergeXR30Row_AVX2; } -#if defined(HAS_MERGEARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MergeXRGBRow = MergeXRGBRow_Any_SSE2; + } +#endif +#if defined(HAS_MERGEXR30ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (depth == 10) { + MergeXR30Row = MergeXR30Row_10_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeXR30Row = MergeXR30Row_10_NEON; + } + } else { + MergeXR30Row = MergeXR30Row_Any_NEON; if (IS_ALIGNED(width, 8)) { - MergeXRGBRow = MergeXRGBRow_SSE2; + MergeXR30Row = MergeXR30Row_NEON; } } + } #endif -#if defined(HAS_MERGEARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeXRGBRow = MergeXRGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - MergeXRGBRow = MergeXRGBRow_AVX2; - } + + for (y = 0; y < height; ++y) { + MergeXR30Row(src_r, src_g, src_b, dst_ar30, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_ar30 += dst_stride_ar30; + } +} + +LIBYUV_NOINLINE +static void MergeAR64PlaneAlpha(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + const uint16_t* src_a, + int src_stride_a, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height, + int depth) { + int y; + void (*MergeAR64Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, const uint16_t* src_a, + uint16_t* dst_argb, int depth, int width) = + MergeAR64Row_C; + + if (src_stride_r == width && src_stride_g == 
width && src_stride_b == width && + src_stride_a == width && dst_stride_ar64 == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = src_stride_a = + dst_stride_ar64 = 0; + } +#if defined(HAS_MERGEAR64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeAR64Row = MergeAR64Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeAR64Row = MergeAR64Row_AVX2; } + } #endif -#if defined(HAS_MERGERGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeXRGBRow = MergeXRGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - MergeXRGBRow = MergeXRGBRow_NEON; - } +#if defined(HAS_MERGEAR64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeAR64Row = MergeAR64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeAR64Row = MergeAR64Row_NEON; } + } #endif - for (y = 0; y < height; ++y) { - MergeXRGBRow(src_r, src_g, src_b, dst_argb, width); - src_r += src_stride_r; - src_g += src_stride_g; - src_b += src_stride_b; - dst_argb += dst_stride_argb; + for (y = 0; y < height; ++y) { + MergeAR64Row(src_r, src_g, src_b, src_a, dst_ar64, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + src_a += src_stride_a; + dst_ar64 += dst_stride_ar64; + } +} + +LIBYUV_NOINLINE +static void MergeAR64PlaneOpaque(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height, + int depth) { + int y; + void (*MergeXR64Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, uint16_t* dst_argb, int depth, + int width) = MergeXR64Row_C; + + // Coalesce rows. 
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + dst_stride_ar64 == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_ar64 = 0; + } +#if defined(HAS_MERGEXR64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeXR64Row = MergeXR64Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeXR64Row = MergeXR64Row_AVX2; } - } else { - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; + } +#endif +#if defined(HAS_MERGEXR64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeXR64Row = MergeXR64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeXR64Row = MergeXR64Row_NEON; } + } +#endif + + for (y = 0; y < height; ++y) { + MergeXR64Row(src_r, src_g, src_b, dst_ar64, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_ar64 += dst_stride_ar64; + } +} + +LIBYUV_API +void MergeAR64Plane(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + const uint16_t* src_a, + int src_stride_a, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height, + int depth) { + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_ar64 = dst_ar64 + (height - 1) * dst_stride_ar64; + dst_stride_ar64 = -dst_stride_ar64; + } + + if (src_a == NULL) { + MergeAR64PlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b, + src_stride_b, dst_ar64, dst_stride_ar64, width, height, + depth); + } else { + MergeAR64PlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b, + src_stride_b, src_a, src_stride_a, dst_ar64, + dst_stride_ar64, width, height, depth); + } +} - if (src_stride_r == width && src_stride_g == width && - src_stride_b == width && src_stride_a == width && - dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_r = src_stride_g = src_stride_b = src_stride_a = - dst_stride_argb = 0; +LIBYUV_NOINLINE +static void MergeARGB16To8PlaneAlpha(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int depth) { + int y; + void (*MergeARGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, const uint16_t* src_a, + uint8_t* dst_argb, int depth, int width) = + MergeARGB16To8Row_C; + + if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + src_stride_a == width && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = src_stride_a = + dst_stride_argb = 0; + } +#if defined(HAS_MERGEARGB16TO8ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeARGB16To8Row = MergeARGB16To8Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeARGB16To8Row = MergeARGB16To8Row_AVX2; } -#if defined(HAS_MERGEARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MergeARGBRow = MergeARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - MergeARGBRow = MergeARGBRow_SSE2; - } + } +#endif +#if defined(HAS_MERGEARGB16TO8ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + 
MergeARGB16To8Row = MergeARGB16To8Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeARGB16To8Row = MergeARGB16To8Row_NEON; } + } #endif -#if defined(HAS_MERGEARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - MergeARGBRow = MergeARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - MergeARGBRow = MergeARGBRow_AVX2; - } + + for (y = 0; y < height; ++y) { + MergeARGB16To8Row(src_r, src_g, src_b, src_a, dst_argb, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + src_a += src_stride_a; + dst_argb += dst_stride_argb; + } +} + +LIBYUV_NOINLINE +static void MergeARGB16To8PlaneOpaque(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int depth) { + int y; + void (*MergeXRGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, uint8_t* dst_argb, int depth, + int width) = MergeXRGB16To8Row_C; + + // Coalesce rows. 
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0; + } +#if defined(HAS_MERGEXRGB16TO8ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeXRGB16To8Row = MergeXRGB16To8Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeXRGB16To8Row = MergeXRGB16To8Row_AVX2; } + } #endif -#if defined(HAS_MERGERGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - MergeARGBRow = MergeARGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - MergeARGBRow = MergeARGBRow_NEON; - } +#if defined(HAS_MERGEXRGB16TO8ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeXRGB16To8Row = MergeXRGB16To8Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeXRGB16To8Row = MergeXRGB16To8Row_NEON; } + } #endif - for (y = 0; y < height; ++y) { - MergeARGBRow(src_r, src_g, src_b, src_a, dst_argb, width); - src_r += src_stride_r; - src_g += src_stride_g; - src_b += src_stride_b; - dst_argb += dst_stride_argb; - } + for (y = 0; y < height; ++y) { + MergeXRGB16To8Row(src_r, src_g, src_b, dst_argb, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_argb += dst_stride_argb; + } +} + +LIBYUV_API +void MergeARGB16To8Plane(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int depth) { + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + + if (src_a == NULL) { + MergeARGB16To8PlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b, + src_stride_b, dst_argb, dst_stride_argb, width, + height, depth); + } else { + MergeARGB16To8PlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b, + src_stride_b, src_a, src_stride_a, dst_argb, + dst_stride_argb, width, height, depth); } } @@ -2244,12 +2642,12 @@ int ARGBAdd(const uint8_t* src_argb0, height = 1; src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } -#if defined(HAS_ARGBADDROW_SSE2) && (defined(_MSC_VER) && !defined(__clang__)) +#if defined(HAS_ARGBADDROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBAddRow = ARGBAddRow_SSE2; } #endif -#if defined(HAS_ARGBADDROW_SSE2) && !(defined(_MSC_VER) && !defined(__clang__)) +#if defined(HAS_ARGBADDROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBAddRow = ARGBAddRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { @@ -3527,6 +3925,76 @@ int ARGBShuffle(const uint8_t* src_bgra, return 0; } +// Shuffle AR64 channel order. e.g. AR64 to AB64. +LIBYUV_API +int AR64Shuffle(const uint16_t* src_ar64, + int src_stride_ar64, + uint16_t* dst_ar64, + int dst_stride_ar64, + const uint8_t* shuffler, + int width, + int height) { + int y; + void (*AR64ShuffleRow)(const uint8_t* src_ar64, uint8_t* dst_ar64, + const uint8_t* shuffler, int width) = AR64ShuffleRow_C; + if (!src_ar64 || !dst_ar64 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar64 = src_ar64 + (height - 1) * src_stride_ar64; + src_stride_ar64 = -src_stride_ar64; + } + // Coalesce rows. + if (src_stride_ar64 == width * 4 && dst_stride_ar64 == width * 4) { + width *= height; + height = 1; + src_stride_ar64 = dst_stride_ar64 = 0; + } + // Assembly versions can be reused if it's implemented with shuffle. 
+#if defined(HAS_ARGBSHUFFLEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + AR64ShuffleRow = ARGBShuffleRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + AR64ShuffleRow = ARGBShuffleRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + AR64ShuffleRow = ARGBShuffleRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + AR64ShuffleRow = ARGBShuffleRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + AR64ShuffleRow = ARGBShuffleRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + AR64ShuffleRow = ARGBShuffleRow_NEON; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + AR64ShuffleRow = ARGBShuffleRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + AR64ShuffleRow = ARGBShuffleRow_MMI; + } + } +#endif + + for (y = 0; y < height; ++y) { + AR64ShuffleRow((uint8_t*)(src_ar64), (uint8_t*)(dst_ar64), shuffler, + width * 2); + src_ar64 += src_stride_ar64; + dst_ar64 += dst_stride_ar64; + } + return 0; +} + // Gauss blur a float plane using Gaussian 5x5 filter with // coefficients of 1, 4, 6, 4, 1. // Each destination pixel is a blur of the 5x5 diff --git a/third_party/libyuv/source/rotate_gcc.cc b/third_party/libyuv/source/rotate_gcc.cc index fd359d4ae6..1a3f8cbbda 100644 --- a/third_party/libyuv/source/rotate_gcc.cc +++ b/third_party/libyuv/source/rotate_gcc.cc @@ -17,8 +17,7 @@ extern "C" { #endif // This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) // Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit. 
#if defined(HAS_TRANSPOSEWX8_SSSE3) diff --git a/third_party/libyuv/source/rotate_win.cc b/third_party/libyuv/source/rotate_win.cc index e887dd525c..a78873f843 100644 --- a/third_party/libyuv/source/rotate_win.cc +++ b/third_party/libyuv/source/rotate_win.cc @@ -16,8 +16,9 @@ namespace libyuv { extern "C" { #endif -// This module is for 32 bit Visual C x86 and clangcl -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) +// This module is for 32 bit Visual C x86 +#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ + !defined(__clang__) && defined(_M_IX86) __declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src, int src_stride, diff --git a/third_party/libyuv/source/row_any.cc b/third_party/libyuv/source/row_any.cc index 08ae1d2af7..c9a402eda2 100644 --- a/third_party/libyuv/source/row_any.cc +++ b/third_party/libyuv/source/row_any.cc @@ -61,6 +61,8 @@ ANY41(MergeARGBRow_Any_AVX2, MergeARGBRow_AVX2, 0, 0, 4, 15) ANY41(MergeARGBRow_Any_NEON, MergeARGBRow_NEON, 0, 0, 4, 15) #endif +// Note that odd width replication includes 444 due to implementation +// on arm that subsamples 444 to 422 internally. 
// Any 4 planes to 1 with yuvconstants #define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ @@ -77,6 +79,10 @@ ANY41(MergeARGBRow_Any_NEON, MergeARGBRow_NEON, 0, 0, 4, 15) memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ memcpy(temp + 192, a_buf + n, r); \ + if (width & 1) { \ + temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \ + temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ + } \ ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \ yuvconstants, MASK + 1); \ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ @@ -115,6 +121,124 @@ ANY41C(I422AlphaToARGBRow_Any_MMI, I422AlphaToARGBRow_MMI, 1, 0, 4, 7) #endif #undef ANY41C +// Any 4 planes to 1 plane of 8 bit with yuvconstants +#define ANY41CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ + void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, const T* a_buf, \ + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \ + int width) { \ + SIMD_ALIGNED(T temp[16 * 4]); \ + SIMD_ALIGNED(uint8_t out[64]); \ + memset(temp, 0, 16 * 4 * SBPP); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r * SBPP); \ + memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(temp + 48, a_buf + n, r * SBPP); \ + ANY_SIMD(temp, temp + 16, temp + 32, temp + 48, out, yuvconstants, \ + MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_I210ALPHATOARGBROW_SSSE3 +ANY41CT(I210AlphaToARGBRow_Any_SSSE3, + I210AlphaToARGBRow_SSSE3, + 1, + 0, + uint16_t, + 2, + 4, + 7) +#endif + +#ifdef HAS_I210ALPHATOARGBROW_AVX2 +ANY41CT(I210AlphaToARGBRow_Any_AVX2, + 
I210AlphaToARGBRow_AVX2, + 1, + 0, + uint16_t, + 2, + 4, + 15) +#endif + +#ifdef HAS_I410ALPHATOARGBROW_SSSE3 +ANY41CT(I410AlphaToARGBRow_Any_SSSE3, + I410AlphaToARGBRow_SSSE3, + 0, + 0, + uint16_t, + 2, + 4, + 7) +#endif + +#ifdef HAS_I410ALPHATOARGBROW_AVX2 +ANY41CT(I410AlphaToARGBRow_Any_AVX2, + I410AlphaToARGBRow_AVX2, + 0, + 0, + uint16_t, + 2, + 4, + 15) +#endif + +#undef ANY41CT + +// Any 4 planes to 1 plane with parameter +#define ANY41PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \ + void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \ + const STYPE* a_buf, DTYPE* dst_ptr, int depth, int width) { \ + SIMD_ALIGNED(STYPE temp[16 * 4]); \ + SIMD_ALIGNED(DTYPE out[64]); \ + memset(temp, 0, 16 * 4 * SBPP); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(r_buf, g_buf, b_buf, a_buf, dst_ptr, depth, n); \ + } \ + memcpy(temp, r_buf + n, r * SBPP); \ + memcpy(temp + 16, g_buf + n, r * SBPP); \ + memcpy(temp + 32, b_buf + n, r * SBPP); \ + memcpy(temp + 48, a_buf + n, r * SBPP); \ + ANY_SIMD(temp, temp + 16, temp + 32, temp + 48, out, depth, MASK + 1); \ + memcpy((uint8_t*)dst_ptr + n * BPP, out, r * BPP); \ + } + +#ifdef HAS_MERGEAR64ROW_AVX2 +ANY41PT(MergeAR64Row_Any_AVX2, MergeAR64Row_AVX2, uint16_t, 2, uint16_t, 8, 15) +#endif + +#ifdef HAS_MERGEAR64ROW_NEON +ANY41PT(MergeAR64Row_Any_NEON, MergeAR64Row_NEON, uint16_t, 2, uint16_t, 8, 7) +#endif + +#ifdef HAS_MERGEARGB16TO8ROW_AVX2 +ANY41PT(MergeARGB16To8Row_Any_AVX2, + MergeARGB16To8Row_AVX2, + uint16_t, + 2, + uint8_t, + 4, + 15) +#endif + +#ifdef HAS_MERGEARGB16TO8ROW_NEON +ANY41PT(MergeARGB16To8Row_Any_NEON, + MergeARGB16To8Row_NEON, + uint16_t, + 2, + uint8_t, + 4, + 7) +#endif + +#undef ANY41PT + // Any 3 planes to 1. 
#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ @@ -144,13 +268,13 @@ ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15) #ifdef HAS_MERGERGBROW_MMI ANY31(MergeRGBRow_Any_MMI, MergeRGBRow_MMI, 0, 0, 3, 7) #endif -#ifdef HAS_MERGEARGBROW_SSE2 +#ifdef HAS_MERGEXRGBROW_SSE2 ANY31(MergeXRGBRow_Any_SSE2, MergeXRGBRow_SSE2, 0, 0, 4, 7) #endif -#ifdef HAS_MERGEARGBROW_AVX2 +#ifdef HAS_MERGEXRGBROW_AVX2 ANY31(MergeXRGBRow_Any_AVX2, MergeXRGBRow_AVX2, 0, 0, 4, 15) #endif -#ifdef HAS_MERGEARGBROW_NEON +#ifdef HAS_MERGEXRGBROW_NEON ANY31(MergeXRGBRow_Any_NEON, MergeXRGBRow_NEON, 0, 0, 4, 15) #endif #ifdef HAS_I422TOYUY2ROW_SSE2 @@ -327,11 +451,99 @@ ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) #ifdef HAS_I210TOAR30ROW_AVX2 ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) #endif +#ifdef HAS_I410TOAR30ROW_SSSE3 +ANY31CT(I410ToAR30Row_Any_SSSE3, I410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I410TOARGBROW_SSSE3 +ANY31CT(I410ToARGBRow_Any_SSSE3, I410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I410TOARGBROW_AVX2 +ANY31CT(I410ToARGBRow_Any_AVX2, I410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_I410TOAR30ROW_AVX2 +ANY31CT(I410ToAR30Row_Any_AVX2, I410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15) +#endif #ifdef HAS_I210TOARGBROW_MMI ANY31CT(I210ToARGBRow_Any_MMI, I210ToARGBRow_MMI, 1, 0, uint16_t, 2, 4, 7) #endif +#ifdef HAS_I212TOAR30ROW_SSSE3 +ANY31CT(I212ToAR30Row_Any_SSSE3, I212ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I212TOARGBROW_SSSE3 +ANY31CT(I212ToARGBRow_Any_SSSE3, I212ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I212TOARGBROW_AVX2 +ANY31CT(I212ToARGBRow_Any_AVX2, I212ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_I212TOAR30ROW_AVX2 +ANY31CT(I212ToAR30Row_Any_AVX2, I212ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif #undef 
ANY31CT +// Any 3 planes to 1 plane with parameter +#define ANY31PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \ + void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \ + DTYPE* dst_ptr, int depth, int width) { \ + SIMD_ALIGNED(STYPE temp[16 * 3]); \ + SIMD_ALIGNED(DTYPE out[64]); \ + memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(r_buf, g_buf, b_buf, dst_ptr, depth, n); \ + } \ + memcpy(temp, r_buf + n, r * SBPP); \ + memcpy(temp + 16, g_buf + n, r * SBPP); \ + memcpy(temp + 32, b_buf + n, r * SBPP); \ + ANY_SIMD(temp, temp + 16, temp + 32, out, depth, MASK + 1); \ + memcpy((uint8_t*)dst_ptr + n * BPP, out, r * BPP); \ + } + +#ifdef HAS_MERGEXR30ROW_AVX2 +ANY31PT(MergeXR30Row_Any_AVX2, MergeXR30Row_AVX2, uint16_t, 2, uint8_t, 4, 15) +#endif + +#ifdef HAS_MERGEXR30ROW_NEON +ANY31PT(MergeXR30Row_Any_NEON, MergeXR30Row_NEON, uint16_t, 2, uint8_t, 4, 3) +ANY31PT(MergeXR30Row_10_Any_NEON, + MergeXR30Row_10_NEON, + uint16_t, + 2, + uint8_t, + 4, + 3) +#endif + +#ifdef HAS_MERGEXR64ROW_AVX2 +ANY31PT(MergeXR64Row_Any_AVX2, MergeXR64Row_AVX2, uint16_t, 2, uint16_t, 8, 15) +#endif + +#ifdef HAS_MERGEXR64ROW_NEON +ANY31PT(MergeXR64Row_Any_NEON, MergeXR64Row_NEON, uint16_t, 2, uint16_t, 8, 7) +#endif + +#ifdef HAS_MERGEXRGB16TO8ROW_AVX2 +ANY31PT(MergeXRGB16To8Row_Any_AVX2, + MergeXRGB16To8Row_AVX2, + uint16_t, + 2, + uint8_t, + 4, + 15) +#endif + +#ifdef HAS_MERGEXRGB16TO8ROW_NEON +ANY31PT(MergeXRGB16To8Row_Any_NEON, + MergeXRGB16To8Row_NEON, + uint16_t, + 2, + uint8_t, + 4, + 7) +#endif + +#undef ANY31PT + // Any 2 planes to 1. 
#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ @@ -546,12 +758,57 @@ ANY21C(NV12ToRGB565Row_Any_MMI, NV12ToRGB565Row_MMI, 1, 1, 2, 2, 7) #endif #undef ANY21C +// Any 2 planes of 16 bit to 1 with yuvconstants +#define ANY21CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ + void NAMEANY(const T* y_buf, const T* uv_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(T temp[16 * 3]); \ + SIMD_ALIGNED(uint8_t out[64]); \ + memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r * SBPP); \ + memcpy(temp + 16, uv_buf + 2 * (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP * 2); \ + ANY_SIMD(temp, temp + 16, out, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_P210TOAR30ROW_SSSE3 +ANY21CT(P210ToAR30Row_Any_SSSE3, P210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_P210TOARGBROW_SSSE3 +ANY21CT(P210ToARGBRow_Any_SSSE3, P210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_P210TOARGBROW_AVX2 +ANY21CT(P210ToARGBRow_Any_AVX2, P210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_P210TOAR30ROW_AVX2 +ANY21CT(P210ToAR30Row_Any_AVX2, P210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_P410TOAR30ROW_SSSE3 +ANY21CT(P410ToAR30Row_Any_SSSE3, P410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_P410TOARGBROW_SSSE3 +ANY21CT(P410ToARGBRow_Any_SSSE3, P410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_P410TOARGBROW_AVX2 +ANY21CT(P410ToARGBRow_Any_AVX2, P410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_P410TOAR30ROW_AVX2 +ANY21CT(P410ToAR30Row_Any_AVX2, P410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15) +#endif + +#undef ANY21CT + 
// Any 2 16 bit planes with parameter to 1 #define ANY21PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \ void NAMEANY(const T* src_u, const T* src_v, T* dst_uv, int depth, \ int width) { \ SIMD_ALIGNED(T temp[16 * 4]); \ - memset(temp, 0, 16 * 4); /* for msan */ \ + memset(temp, 0, 16 * 4 * BPP); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ @@ -1100,6 +1357,72 @@ ANY11P(ARGBShuffleRow_Any_MMI, ARGBShuffleRow_MMI, const uint8_t*, 4, 4, 1) #undef ANY11P #undef ANY11P +// Any 1 to 1 with type +#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ + void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[(MASK + 1) * SBPP]); \ + SIMD_ALIGNED(uint8_t out[(MASK + 1) * BPP]); \ + memset(temp, 0, (MASK + 1) * SBPP); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(temp, (uint8_t*)(src_ptr) + n * SBPP, r * SBPP); \ + ANY_SIMD((STYPE*)temp, (DTYPE*)out, MASK + 1); \ + memcpy((uint8_t*)(dst_ptr) + n * BPP, out, r * BPP); \ + } + +#ifdef HAS_ARGBTOAR64ROW_SSSE3 +ANY11T(ARGBToAR64Row_Any_SSSE3, ARGBToAR64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3) +#endif + +#ifdef HAS_ARGBTOAB64ROW_SSSE3 +ANY11T(ARGBToAB64Row_Any_SSSE3, ARGBToAB64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3) +#endif + +#ifdef HAS_AR64TOARGBROW_SSSE3 +ANY11T(AR64ToARGBRow_Any_SSSE3, AR64ToARGBRow_SSSE3, 8, 4, uint16_t, uint8_t, 3) +#endif + +#ifdef HAS_ARGBTOAR64ROW_SSSE3 +ANY11T(AB64ToARGBRow_Any_SSSE3, AB64ToARGBRow_SSSE3, 8, 4, uint16_t, uint8_t, 3) +#endif + +#ifdef HAS_ARGBTOAR64ROW_AVX2 +ANY11T(ARGBToAR64Row_Any_AVX2, ARGBToAR64Row_AVX2, 4, 8, uint8_t, uint16_t, 7) +#endif + +#ifdef HAS_ARGBTOAB64ROW_AVX2 +ANY11T(ARGBToAB64Row_Any_AVX2, ARGBToAB64Row_AVX2, 4, 8, uint8_t, uint16_t, 7) +#endif + +#ifdef HAS_AR64TOARGBROW_AVX2 +ANY11T(AR64ToARGBRow_Any_AVX2, AR64ToARGBRow_AVX2, 8, 4, uint16_t, uint8_t, 7) +#endif + +#ifdef HAS_ARGBTOAR64ROW_AVX2 
+ANY11T(AB64ToARGBRow_Any_AVX2, AB64ToARGBRow_AVX2, 8, 4, uint16_t, uint8_t, 7) +#endif + +#ifdef HAS_ARGBTOAR64ROW_NEON +ANY11T(ARGBToAR64Row_Any_NEON, ARGBToAR64Row_NEON, 4, 8, uint8_t, uint16_t, 7) +#endif + +#ifdef HAS_ARGBTOAB64ROW_NEON +ANY11T(ARGBToAB64Row_Any_NEON, ARGBToAB64Row_NEON, 4, 8, uint8_t, uint16_t, 7) +#endif + +#ifdef HAS_AR64TOARGBROW_NEON +ANY11T(AR64ToARGBRow_Any_NEON, AR64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7) +#endif + +#ifdef HAS_ARGBTOAR64ROW_NEON +ANY11T(AB64ToARGBRow_Any_NEON, AB64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7) +#endif + +#undef ANY11T + // Any 1 to 1 with parameter and shorts. BPP measures in shorts. #define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \ @@ -1266,38 +1589,38 @@ ANY11C(UYVYToARGBRow_Any_MMI, UYVYToARGBRow_MMI, 1, 4, 4, 7) #undef ANY11C // Any 1 to 1 interpolate. Takes 2 rows of source via stride. -#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ - void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, \ - ptrdiff_t src_stride_ptr, int width, int source_y_fraction) { \ - SIMD_ALIGNED(uint8_t temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \ - } \ - memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ - memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \ - ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ +#define ANY11I(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ + void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, \ + int width, int source_y_fraction) { \ + SIMD_ALIGNED(uint8_t temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \ 
+ } \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + memcpy(temp + 64, src_ptr + src_stride + n * SBPP, r * SBPP); \ + ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ } #ifdef HAS_INTERPOLATEROW_AVX2 -ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31) +ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31) #endif #ifdef HAS_INTERPOLATEROW_SSSE3 -ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15) +ANY11I(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15) #endif #ifdef HAS_INTERPOLATEROW_NEON -ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15) +ANY11I(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15) #endif #ifdef HAS_INTERPOLATEROW_MSA -ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31) +ANY11I(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31) #endif #ifdef HAS_INTERPOLATEROW_MMI -ANY11T(InterpolateRow_Any_MMI, InterpolateRow_MMI, 1, 1, 7) +ANY11I(InterpolateRow_Any_MMI, InterpolateRow_MMI, 1, 1, 7) #endif -#undef ANY11T +#undef ANY11I // Any 1 to 1 mirror. #define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ @@ -1508,16 +1831,16 @@ ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15) #ifdef HAS_SPLITRGBROW_MMI ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3) #endif -#ifdef HAS_SPLITARGBROW_SSE2 +#ifdef HAS_SPLITXRGBROW_SSE2 ANY13(SplitXRGBRow_Any_SSE2, SplitXRGBRow_SSE2, 4, 7) #endif -#ifdef HAS_SPLITARGBROW_SSSE3 +#ifdef HAS_SPLITXRGBROW_SSSE3 ANY13(SplitXRGBRow_Any_SSSE3, SplitXRGBRow_SSSE3, 4, 7) #endif -#ifdef HAS_SPLITARGBROW_AVX2 +#ifdef HAS_SPLITXRGBROW_AVX2 ANY13(SplitXRGBRow_Any_AVX2, SplitXRGBRow_AVX2, 4, 15) #endif -#ifdef HAS_SPLITARGBROW_NEON +#ifdef HAS_SPLITXRGBROW_NEON ANY13(SplitXRGBRow_Any_NEON, SplitXRGBRow_NEON, 4, 15) #endif @@ -1557,17 +1880,17 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15) // Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. 
// 128 byte row allows for 32 avx ARGB pixels. #define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, \ + void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \ uint8_t* dst_v, int width) { \ SIMD_ALIGNED(uint8_t temp[128 * 4]); \ memset(temp, 0, 128 * 2); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ - ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \ + ANY_SIMD(src_ptr, src_stride, dst_u, dst_v, n); \ } \ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ + memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ SS(r, UVSHIFT) * BPP); \ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ @@ -1714,17 +2037,17 @@ ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15) // Any 1 to 1 with source stride (2 rows of source). Outputs UV plane. // 128 byte row allows for 32 avx ARGB pixels. 
#define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_vu, \ + void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_vu, \ int width) { \ SIMD_ALIGNED(uint8_t temp[128 * 3]); \ memset(temp, 0, 128 * 2); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ - ANY_SIMD(src_ptr, src_stride_ptr, dst_vu, n); \ + ANY_SIMD(src_ptr, src_stride, dst_vu, n); \ } \ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ + memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ SS(r, UVSHIFT) * BPP); \ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ diff --git a/third_party/libyuv/source/row_common.cc b/third_party/libyuv/source/row_common.cc index a941c3f5fc..c6e412414e 100644 --- a/third_party/libyuv/source/row_common.cc +++ b/third_party/libyuv/source/row_common.cc @@ -10,6 +10,7 @@ #include "libyuv/row.h" +#include <assert.h> #include <stdio.h> #include <string.h> // For memcpy and memset. @@ -21,10 +22,14 @@ namespace libyuv { extern "C" { #endif -// The following ifdef from row_win makes the C code match the row_win code, -// which is 7 bit fixed point. 
+// This macro control YUV to RGB using unsigned math to extend range of +// YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for more accuracy on B: +// LIBYUV_UNLIMITED_DATA + +// The following macro from row_win makes the C code match the row_win code, +// which is 7 bit fixed point for ARGBToI420: #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__))) + !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) #define LIBYUV_RGB7 1 #endif @@ -50,6 +55,11 @@ static __inline int32_t clamp1023(int32_t v) { return (-(v >= 1023) | v) & 1023; } +// clamp to max +static __inline int32_t ClampMax(int32_t v, int32_t max) { + return (-(v >= max) | v) & max; +} + static __inline uint32_t Abs(int32_t v) { int m = -(v < 0); return (v + m) ^ m; @@ -67,6 +77,10 @@ static __inline int32_t clamp1023(int32_t v) { return (v > 1023) ? 1023 : v; } +static __inline int32_t ClampMax(int32_t v, int32_t max) { + return (v > max) ? max : v; +} + static __inline uint32_t Abs(int32_t v) { return (v < 0) ? 
-v : v; } @@ -413,6 +427,82 @@ void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { } } +void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { + int x; + for (x = 0; x < width; ++x) { + dst_ar64[0] = src_argb[0] * 0x0101; + dst_ar64[1] = src_argb[1] * 0x0101; + dst_ar64[2] = src_argb[2] * 0x0101; + dst_ar64[3] = src_argb[3] * 0x0101; + dst_ar64 += 4; + src_argb += 4; + } +} + +void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { + int x; + for (x = 0; x < width; ++x) { + dst_ab64[0] = src_argb[2] * 0x0101; + dst_ab64[1] = src_argb[1] * 0x0101; + dst_ab64[2] = src_argb[0] * 0x0101; + dst_ab64[3] = src_argb[3] * 0x0101; + dst_ab64 += 4; + src_argb += 4; + } +} + +void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + dst_argb[0] = src_ar64[0] >> 8; + dst_argb[1] = src_ar64[1] >> 8; + dst_argb[2] = src_ar64[2] >> 8; + dst_argb[3] = src_ar64[3] >> 8; + dst_argb += 4; + src_ar64 += 4; + } +} + +void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + dst_argb[0] = src_ab64[2] >> 8; + dst_argb[1] = src_ab64[1] >> 8; + dst_argb[2] = src_ab64[0] >> 8; + dst_argb[3] = src_ab64[3] >> 8; + dst_argb += 4; + src_ab64 += 4; + } +} + +// TODO(fbarchard): Make shuffle compatible with SIMD versions +void AR64ShuffleRow_C(const uint8_t* src_ar64, + uint8_t* dst_ar64, + const uint8_t* shuffler, + int width) { + const uint16_t* src_ar64_16 = (const uint16_t*)src_ar64; + uint16_t* dst_ar64_16 = (uint16_t*)dst_ar64; + int index0 = shuffler[0] / 2; + int index1 = shuffler[2] / 2; + int index2 = shuffler[4] / 2; + int index3 = shuffler[6] / 2; + // Shuffle a row of AR64. + int x; + for (x = 0; x < width / 2; ++x) { + // To support in-place conversion. 
+ uint16_t b = src_ar64_16[index0]; + uint16_t g = src_ar64_16[index1]; + uint16_t r = src_ar64_16[index2]; + uint16_t a = src_ar64_16[index3]; + dst_ar64_16[0] = b; + dst_ar64_16[1] = g; + dst_ar64_16[2] = r; + dst_ar64_16[3] = a; + src_ar64_16 += 4; + dst_ar64_16 += 4; + } +} + #ifdef LIBYUV_RGB7 // Old 7 bit math for compatibility on unsupported platforms. static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { @@ -462,80 +552,80 @@ static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) { // Intel version mimic SSE/AVX which does 2 pavgb #if LIBYUV_ARGBTOUV_PAVGB -#define MAKEROWY(NAME, R, G, B, BPP) \ - void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ - AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ - uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ - AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ - uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ - AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \ - uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \ - uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - } \ +#define MAKEROWY(NAME, R, G, B, BPP) \ + void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = 
RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \ + AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \ + uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \ + AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \ + uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \ + AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \ + uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \ + uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + } \ } #else // ARM version does sum / 2 then multiply by 2x smaller coefficients -#define MAKEROWY(NAME, R, G, B, BPP) \ - void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \ - src_rgb1[B + BPP] + 1) >> \ - 1; \ - uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \ - src_rgb1[G + BPP] + 1) >> \ - 1; \ - uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \ - src_rgb1[R + BPP] + 1) >> \ - 1; \ - dst_u[0] = RGB2xToU(ar, ag, ab); \ - dst_v[0] = RGB2xToV(ar, ag, ab); \ - 
src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint16_t ab = src_rgb0[B] + src_rgb1[B]; \ - uint16_t ag = src_rgb0[G] + src_rgb1[G]; \ - uint16_t ar = src_rgb0[R] + src_rgb1[R]; \ - dst_u[0] = RGB2xToU(ar, ag, ab); \ - dst_v[0] = RGB2xToV(ar, ag, ab); \ - } \ +#define MAKEROWY(NAME, R, G, B, BPP) \ + void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP] + 1) >> \ + 1; \ + uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP] + 1) >> \ + 1; \ + uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP] + 1) >> \ + 1; \ + dst_u[0] = RGB2xToU(ar, ag, ab); \ + dst_v[0] = RGB2xToV(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint16_t ab = src_rgb[B] + src_rgb1[B]; \ + uint16_t ag = src_rgb[G] + src_rgb1[G]; \ + uint16_t ar = src_rgb[R] + src_rgb1[R]; \ + dst_u[0] = RGB2xToU(ar, ag, ab); \ + dst_v[0] = RGB2xToV(ar, ag, ab); \ + } \ } #endif @@ -603,80 +693,80 @@ static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { // ARGBToYJ_C and ARGBToUVJ_C // Intel version mimic SSE/AVX which does 2 pavgb #if LIBYUV_ARGBTOUV_PAVGB -#define MAKEROWYJ(NAME, R, G, B, BPP) \ - void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ - 
} \ - void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ - AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ - uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ - AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ - uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ - AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \ - uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \ - uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - } \ +#define MAKEROWYJ(NAME, R, G, B, BPP) \ + void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \ + AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \ + uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \ + AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \ + uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \ + AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \ + uint8_t ag = 
AVGB(src_rgb[G], src_rgb1[G]); \ + uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + } \ } #else // ARM version does sum / 2 then multiply by 2x smaller coefficients -#define MAKEROWYJ(NAME, R, G, B, BPP) \ - void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \ - src_rgb1[B + BPP] + 1) >> \ - 1; \ - uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \ - src_rgb1[G + BPP] + 1) >> \ - 1; \ - uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \ - src_rgb1[R + BPP] + 1) >> \ - 1; \ - dst_u[0] = RGB2xToUJ(ar, ag, ab); \ - dst_v[0] = RGB2xToVJ(ar, ag, ab); \ - src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint16_t ab = (src_rgb0[B] + src_rgb1[B]); \ - uint16_t ag = (src_rgb0[G] + src_rgb1[G]); \ - uint16_t ar = (src_rgb0[R] + src_rgb1[R]); \ - dst_u[0] = RGB2xToUJ(ar, ag, ab); \ - dst_v[0] = RGB2xToVJ(ar, ag, ab); \ - } \ +#define MAKEROWYJ(NAME, R, G, B, BPP) \ + void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint16_t ab = (src_rgb[B] + 
src_rgb[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP] + 1) >> \ + 1; \ + uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP] + 1) >> \ + 1; \ + uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP] + 1) >> \ + 1; \ + dst_u[0] = RGB2xToUJ(ar, ag, ab); \ + dst_v[0] = RGB2xToVJ(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint16_t ab = (src_rgb[B] + src_rgb1[B]); \ + uint16_t ag = (src_rgb[G] + src_rgb1[G]); \ + uint16_t ar = (src_rgb[R] + src_rgb1[R]); \ + dst_u[0] = RGB2xToUJ(ar, ag, ab); \ + dst_v[0] = RGB2xToVJ(ar, ag, ab); \ + } \ } #endif @@ -1146,16 +1236,16 @@ void ARGBShadeRow_C(const uint8_t* src_argb, #define REPEAT8(v) (v) | ((v) << 8) #define SHADE(f, v) v* f >> 16 -void ARGBMultiplyRow_C(const uint8_t* src_argb0, +void ARGBMultiplyRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { int i; for (i = 0; i < width; ++i) { - const uint32_t b = REPEAT8(src_argb0[0]); - const uint32_t g = REPEAT8(src_argb0[1]); - const uint32_t r = REPEAT8(src_argb0[2]); - const uint32_t a = REPEAT8(src_argb0[3]); + const uint32_t b = REPEAT8(src_argb[0]); + const uint32_t g = REPEAT8(src_argb[1]); + const uint32_t r = REPEAT8(src_argb[2]); + const uint32_t a = REPEAT8(src_argb[3]); const uint32_t b_scale = src_argb1[0]; const uint32_t g_scale = src_argb1[1]; const uint32_t r_scale = src_argb1[2]; @@ -1164,7 +1254,7 @@ void ARGBMultiplyRow_C(const uint8_t* src_argb0, dst_argb[1] = SHADE(g, g_scale); dst_argb[2] = SHADE(r, r_scale); dst_argb[3] = SHADE(a, a_scale); - src_argb0 += 4; + src_argb += 4; src_argb1 += 4; dst_argb += 4; } @@ -1174,16 +1264,16 @@ void ARGBMultiplyRow_C(const uint8_t* src_argb0, #define SHADE(f, v) clamp255(v + f) -void ARGBAddRow_C(const uint8_t* src_argb0, +void ARGBAddRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { int i; for (i = 0; i 
< width; ++i) { - const int b = src_argb0[0]; - const int g = src_argb0[1]; - const int r = src_argb0[2]; - const int a = src_argb0[3]; + const int b = src_argb[0]; + const int g = src_argb[1]; + const int r = src_argb[2]; + const int a = src_argb[3]; const int b_add = src_argb1[0]; const int g_add = src_argb1[1]; const int r_add = src_argb1[2]; @@ -1192,7 +1282,7 @@ void ARGBAddRow_C(const uint8_t* src_argb0, dst_argb[1] = SHADE(g, g_add); dst_argb[2] = SHADE(r, r_add); dst_argb[3] = SHADE(a, a_add); - src_argb0 += 4; + src_argb += 4; src_argb1 += 4; dst_argb += 4; } @@ -1201,16 +1291,16 @@ void ARGBAddRow_C(const uint8_t* src_argb0, #define SHADE(f, v) clamp0(f - v) -void ARGBSubtractRow_C(const uint8_t* src_argb0, +void ARGBSubtractRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { int i; for (i = 0; i < width; ++i) { - const int b = src_argb0[0]; - const int g = src_argb0[1]; - const int r = src_argb0[2]; - const int a = src_argb0[3]; + const int b = src_argb[0]; + const int g = src_argb[1]; + const int r = src_argb[2]; + const int a = src_argb[3]; const int b_sub = src_argb1[0]; const int g_sub = src_argb1[1]; const int r_sub = src_argb1[2]; @@ -1219,7 +1309,7 @@ void ARGBSubtractRow_C(const uint8_t* src_argb0, dst_argb[1] = SHADE(g, g_sub); dst_argb[2] = SHADE(r, r_sub); dst_argb[3] = SHADE(a, a_sub); - src_argb0 += 4; + src_argb += 4; src_argb1 += 4; dst_argb += 4; } @@ -1329,64 +1419,36 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { // Macros to create SIMD specific yuv to rgb conversion constants. 
-#if defined(__aarch64__) -#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \ - const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \ - {UB, VR, UB, VR, UB, VR, UB, VR}, {UB, VR, UB, VR, UB, VR, UB, VR}, \ - {UG, VG, UG, VG, UG, VG, UG, VG}, {UG, VG, UG, VG, UG, VG, UG, VG}, \ - {BB, BG, BR, YB, 0, 0, 0, 0}, {0x0101 * YG, YG, 0, 0}}; \ - const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \ - {VR, UB, VR, UB, VR, UB, VR, UB}, {VR, UB, VR, UB, VR, UB, VR, UB}, \ - {VG, UG, VG, UG, VG, UG, VG, UG}, {VG, UG, VG, UG, VG, UG, VG, UG}, \ - {BR, BG, BB, YB, 0, 0, 0, 0}, {0x0101 * YG, YG, 0, 0}}; - -#elif defined(__arm__) -#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \ - const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \ - {UB, UB, UB, UB, VR, VR, VR, VR, 0, 0, 0, 0, 0, 0, 0, 0}, \ - {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, \ - {BB, BG, BR, YB, 0, 0, 0, 0}, \ - {0x0101 * YG, YG, 0, 0}}; \ - const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \ - {VR, VR, VR, VR, UB, UB, UB, UB, 0, 0, 0, 0, 0, 0, 0, 0}, \ - {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, \ - {BR, BG, BB, YB, 0, 0, 0, 0}, \ - {0x0101 * YG, YG, 0, 0}}; +// clang-format off +#if defined(__aarch64__) || defined(__arm__) +// Bias values include subtract 128 from U and V, bias from Y and rounding. +// For B and R bias is negative. For G bias is positive. 
+#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \ + {{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, \ + {YG, (UB * 128 - YB), (UG * 128 + VG * 128 + YB), (VR * 128 - YB), YB, 0, \ + 0, 0}} #else -#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \ - const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \ - {-UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, \ - -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0}, \ - {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \ - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \ - {0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, \ - 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR}, \ - {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \ - {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \ - {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \ - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \ - {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}; \ - const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \ - {-VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, \ - -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0}, \ - {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, \ - VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, \ - {0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, \ - 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB}, \ - {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \ - {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \ - {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \ - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \ - {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}; +#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \ + 
{{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, \ + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, \ + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \ + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \ + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, \ + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, \ + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \ + {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}} #endif -// TODO(fbarchard): Generate SIMD structures from float matrix. +// clang-format on -// Bias values to round, and subtract 128 from U and V. -#define BB (-UB * 128 + YB) -#define BG (UG * 128 + VG * 128 + YB) -#define BR (-VR * 128 + YB) +#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR) \ + const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = \ + YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR); \ + const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \ + YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB); + +// TODO(fbarchard): Generate SIMD structures from float matrix. // BT.601 limited range YUV to RGB reference // R = (Y - 16) * 1.164 + V * 1.596 @@ -1395,7 +1457,11 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { // KR = 0.299; KB = 0.114 // U and V contributions to R,G,B. 
+#ifdef LIBYUV_UNLIMITED_DATA +#define UB 129 /* round(2.018 * 64) */ +#else #define UB 128 /* max(128, round(2.018 * 64)) */ +#endif #define UG 25 /* round(0.391 * 64) */ #define VG 52 /* round(0.813 * 64) */ #define VR 102 /* round(1.596 * 64) */ @@ -1404,7 +1470,7 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ #define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ -MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR, BB, BG, BR) +MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR) #undef YG #undef YB @@ -1429,7 +1495,7 @@ MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR, BB, BG, BR) #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ #define YB 32 /* 64 / 2 */ -MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR) +MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR) #undef YG #undef YB @@ -1444,9 +1510,12 @@ MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR) // B = (Y - 16) * 1.164 + U * 2.112 // KR = 0.2126, KB = 0.0722 -// TODO(fbarchard): Find way to express 2.112 instead of 2.0. // U and V contributions to R,G,B. 
+#ifdef LIBYUV_UNLIMITED_DATA +#define UB 135 /* round(2.112 * 64) */ +#else #define UB 128 /* max(128, round(2.112 * 64)) */ +#endif #define UG 14 /* round(0.213 * 64) */ #define VG 34 /* round(0.533 * 64) */ #define VR 115 /* round(1.793 * 64) */ @@ -1455,7 +1524,7 @@ MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR) #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ #define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ -MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR, BB, BG, BR) +MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR) #undef YG #undef YB @@ -1480,7 +1549,7 @@ MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR, BB, BG, BR) #define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */ #define YB 32 /* 64 / 2 */ -MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR) +MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR) #undef YG #undef YB @@ -1495,9 +1564,12 @@ MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR) // B = (Y - 16) * 1.164384 + U * 2.14177 // KR = 0.2627; KB = 0.0593 -// TODO(fbarchard): Improve accuracy; the B channel is off by 7%. // U and V contributions to R,G,B. 
+#ifdef LIBYUV_UNLIMITED_DATA +#define UB 137 /* round(2.142 * 64) */ +#else #define UB 128 /* max(128, round(2.142 * 64)) */ +#endif #define UG 12 /* round(0.187326 * 64) */ #define VG 42 /* round(0.65042 * 64) */ #define VR 107 /* round(1.67867 * 64) */ @@ -1506,7 +1578,7 @@ MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR) #define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */ #define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */ -MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR, BB, BG, BR) +MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR) #undef YG #undef YB @@ -1530,7 +1602,7 @@ MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR, BB, BG, BR) #define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */ #define YB 32 /* 64 / 2 */ -MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR, BB, BG, BR) +MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR) #undef YG #undef YB @@ -1545,6 +1617,42 @@ MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR, BB, BG, BR) #undef MAKEYUVCONSTANTS +#if defined(__aarch64__) || defined(__arm__) +#define LOAD_YUV_CONSTANTS \ + int ub = yuvconstants->kUVCoeff[0]; \ + int vr = yuvconstants->kUVCoeff[1]; \ + int ug = yuvconstants->kUVCoeff[2]; \ + int vg = yuvconstants->kUVCoeff[3]; \ + int yg = yuvconstants->kRGBCoeffBias[0]; \ + int bb = yuvconstants->kRGBCoeffBias[1]; \ + int bg = yuvconstants->kRGBCoeffBias[2]; \ + int br = yuvconstants->kRGBCoeffBias[3] + +#define CALC_RGB16 \ + int32_t y1 = (uint32_t)(y32 * yg) >> 16; \ + int b16 = y1 + (u * ub) - bb; \ + int g16 = y1 + bg - (u * ug + v * vg); \ + int r16 = y1 + (v * vr) - br +#else +#define LOAD_YUV_CONSTANTS \ + int ub = yuvconstants->kUVToB[0]; \ + int ug = yuvconstants->kUVToG[0]; \ + int vg = yuvconstants->kUVToG[1]; \ + int vr = yuvconstants->kUVToR[1]; \ + int yg = yuvconstants->kYToRgb[0]; \ + int yb = yuvconstants->kYBiasToRgb[0] + +#define CALC_RGB16 \ + int32_t y1 = ((uint32_t)(y32 * yg) >> 16) + yb; \ + int8_t ui = u; \ + int8_t vi = v; \ + ui -= 0x80; \ + vi -= 0x80; \ + int b16 = y1 + 
(ui * ub); \ + int g16 = y1 - (ui * ug + vi * vg); \ + int r16 = y1 + (vi * vr) +#endif + // C reference code that mimics the YUV assembly. // Reads 8 bit YUV and leaves result as 16 bit. static __inline void YuvPixel(uint8_t y, @@ -1554,39 +1662,12 @@ static __inline void YuvPixel(uint8_t y, uint8_t* g, uint8_t* r, const struct YuvConstants* yuvconstants) { -#if defined(__aarch64__) - int ub = -yuvconstants->kUVToRB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[1]; - int vr = -yuvconstants->kUVToRB[1]; - int bb = yuvconstants->kUVBiasBGR[0]; - int bg = yuvconstants->kUVBiasBGR[1]; - int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[1]; -#elif defined(__arm__) - int ub = -yuvconstants->kUVToRB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[4]; - int vr = -yuvconstants->kUVToRB[4]; - int bb = yuvconstants->kUVBiasBGR[0]; - int bg = yuvconstants->kUVBiasBGR[1]; - int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[1]; -#else - int ub = yuvconstants->kUVToB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[1]; - int vr = yuvconstants->kUVToR[1]; - int bb = yuvconstants->kUVBiasB[0]; - int bg = yuvconstants->kUVBiasG[0]; - int br = yuvconstants->kUVBiasR[0]; - int yg = yuvconstants->kYToRgb[0]; -#endif - - uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; - *b = Clamp((int32_t)(y1 + -(u * ub) + bb) >> 6); - *g = Clamp((int32_t)(y1 + -(u * ug + v * vg) + bg) >> 6); - *r = Clamp((int32_t)(y1 + -(v * vr) + br) >> 6); + LOAD_YUV_CONSTANTS; + uint32_t y32 = y * 0x0101; + CALC_RGB16; + *b = Clamp((int32_t)(b16) >> 6); + *g = Clamp((int32_t)(g16) >> 6); + *r = Clamp((int32_t)(r16) >> 6); } // Reads 8 bit YUV and leaves result as 16 bit. 
@@ -1597,85 +1678,50 @@ static __inline void YuvPixel8_16(uint8_t y, int* g, int* r, const struct YuvConstants* yuvconstants) { -#if defined(__aarch64__) - int ub = -yuvconstants->kUVToRB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[1]; - int vr = -yuvconstants->kUVToRB[1]; - int bb = yuvconstants->kUVBiasBGR[0]; - int bg = yuvconstants->kUVBiasBGR[1]; - int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[1]; -#elif defined(__arm__) - int ub = -yuvconstants->kUVToRB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[4]; - int vr = -yuvconstants->kUVToRB[4]; - int bb = yuvconstants->kUVBiasBGR[0]; - int bg = yuvconstants->kUVBiasBGR[1]; - int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[1]; -#else - int ub = yuvconstants->kUVToB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[1]; - int vr = yuvconstants->kUVToR[1]; - int bb = yuvconstants->kUVBiasB[0]; - int bg = yuvconstants->kUVBiasG[0]; - int br = yuvconstants->kUVBiasR[0]; - int yg = yuvconstants->kYToRgb[0]; -#endif - - uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; - *b = (int)(-(u * ub) + y1 + bb); - *g = (int)(-(u * ug + v * vg) + y1 + bg); - *r = (int)(-(v * vr) + y1 + br); + LOAD_YUV_CONSTANTS; + uint32_t y32 = y * 0x0101; + CALC_RGB16; + *b = b16; + *g = g16; + *r = r16; } // C reference code that mimics the YUV 16 bit assembly. // Reads 10 bit YUV and leaves result as 16 bit. 
-static __inline void YuvPixel16(int16_t y, - int16_t u, - int16_t v, - int* b, - int* g, - int* r, - const struct YuvConstants* yuvconstants) { -#if defined(__aarch64__) - int ub = -yuvconstants->kUVToRB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[1]; - int vr = -yuvconstants->kUVToRB[1]; - int bb = yuvconstants->kUVBiasBGR[0]; - int bg = yuvconstants->kUVBiasBGR[1]; - int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[1]; -#elif defined(__arm__) - int ub = -yuvconstants->kUVToRB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[4]; - int vr = -yuvconstants->kUVToRB[4]; - int bb = yuvconstants->kUVBiasBGR[0]; - int bg = yuvconstants->kUVBiasBGR[1]; - int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[1]; -#else - int ub = yuvconstants->kUVToB[0]; - int ug = yuvconstants->kUVToG[0]; - int vg = yuvconstants->kUVToG[1]; - int vr = yuvconstants->kUVToR[1]; - int bb = yuvconstants->kUVBiasB[0]; - int bg = yuvconstants->kUVBiasG[0]; - int br = yuvconstants->kUVBiasR[0]; - int yg = yuvconstants->kYToRgb[0]; -#endif - - uint32_t y1 = (uint32_t)((y << 6) * yg) >> 16; +static __inline void YuvPixel10_16(uint16_t y, + uint16_t u, + uint16_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { + LOAD_YUV_CONSTANTS; + uint32_t y32 = y << 6; u = clamp255(u >> 2); v = clamp255(v >> 2); - *b = (int)(-(u * ub) + y1 + bb); - *g = (int)(-(u * ug + v * vg) + y1 + bg); - *r = (int)(-(v * vr) + y1 + br); + CALC_RGB16; + *b = b16; + *g = g16; + *r = r16; +} + +// C reference code that mimics the YUV 16 bit assembly. +// Reads 12 bit YUV and leaves result as 16 bit. 
+static __inline void YuvPixel12_16(int16_t y, + int16_t u, + int16_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { + LOAD_YUV_CONSTANTS; + uint32_t y32 = y << 4; + u = clamp255(u >> 4); + v = clamp255(v >> 4); + CALC_RGB16; + *b = b16; + *g = g16; + *r = r16; } // C reference code that mimics the YUV 10 bit assembly. @@ -1690,22 +1736,78 @@ static __inline void YuvPixel10(uint16_t y, int b16; int g16; int r16; - YuvPixel16(y, u, v, &b16, &g16, &r16, yuvconstants); + YuvPixel10_16(y, u, v, &b16, &g16, &r16, yuvconstants); + *b = Clamp(b16 >> 6); + *g = Clamp(g16 >> 6); + *r = Clamp(r16 >> 6); +} + +// C reference code that mimics the YUV 12 bit assembly. +// Reads 12 bit YUV and clamps down to 8 bit RGB. +static __inline void YuvPixel12(uint16_t y, + uint16_t u, + uint16_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { + int b16; + int g16; + int r16; + YuvPixel12_16(y, u, v, &b16, &g16, &r16, yuvconstants); *b = Clamp(b16 >> 6); *g = Clamp(g16 >> 6); *r = Clamp(r16 >> 6); } +// C reference code that mimics the YUV 16 bit assembly. +// Reads 16 bit YUV and leaves result as 8 bit. +static __inline void YuvPixel16_8(uint16_t y, + uint16_t u, + uint16_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { + LOAD_YUV_CONSTANTS; + uint32_t y32 = y; + u = clamp255(u >> 8); + v = clamp255(v >> 8); + CALC_RGB16; + *b = Clamp((int32_t)(b16) >> 6); + *g = Clamp((int32_t)(g16) >> 6); + *r = Clamp((int32_t)(r16) >> 6); +} + +// C reference code that mimics the YUV 16 bit assembly. +// Reads 16 bit YUV and leaves result as 16 bit. 
+static __inline void YuvPixel16_16(uint16_t y, + uint16_t u, + uint16_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { + LOAD_YUV_CONSTANTS; + uint32_t y32 = y; + u = clamp255(u >> 8); + v = clamp255(v >> 8); + CALC_RGB16; + *b = b16; + *g = g16; + *r = r16; +} + // C reference code that mimics the YUV assembly. -// Reads 8 bit YUV and leaves result as 16 bit. +// Reads 8 bit YUV and leaves result as 8 bit. static __inline void YPixel(uint8_t y, uint8_t* b, uint8_t* g, uint8_t* r, const struct YuvConstants* yuvconstants) { #if defined(__aarch64__) || defined(__arm__) - int ygb = yuvconstants->kUVBiasBGR[3]; - int yg = yuvconstants->kYToRgb[1]; + int yg = yuvconstants->kRGBCoeffBias[0]; + int ygb = yuvconstants->kRGBCoeffBias[4]; #else int ygb = yuvconstants->kYBiasToRgb[0]; int yg = yuvconstants->kYToRgb[0]; @@ -1716,38 +1818,6 @@ static __inline void YPixel(uint8_t y, *r = Clamp(((int32_t)(y1) + ygb) >> 6); } -#if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON)) -// C mimic assembly. -// TODO(fbarchard): Remove subsampling from Neon. -void I444ToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - uint8_t u = (src_u[0] + src_u[1] + 1) >> 1; - uint8_t v = (src_v[0] + src_v[1] + 1) >> 1; - YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, - yuvconstants); - rgb_buf[3] = 255; - YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, - yuvconstants); - rgb_buf[7] = 255; - src_y += 2; - src_u += 2; - src_v += 2; - rgb_buf += 8; // Advance 2 pixels. 
- } - if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - } -} -#else void I444ToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1765,7 +1835,6 @@ void I444ToARGBRow_C(const uint8_t* src_y, rgb_buf += 4; // Advance 1 pixel. } } -#endif // Also used for 420 void I422ToARGBRow_C(const uint8_t* src_y, @@ -1821,9 +1890,102 @@ void I210ToARGBRow_C(const uint16_t* src_y, } } +void I410ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + src_y += 1; + src_u += 1; + src_v += 1; + rgb_buf += 4; // Advance 1 pixels. + } +} + +void I210AlphaToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + const uint16_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = clamp255(src_a[0] >> 2); + YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = clamp255(src_a[1] >> 2); + src_y += 2; + src_u += 1; + src_v += 1; + src_a += 2; + rgb_buf += 8; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = clamp255(src_a[0] >> 2); + } +} + +void I410AlphaToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + const uint16_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = clamp255(src_a[0] >> 2); + src_y += 1; + src_u += 1; + src_v += 1; + src_a += 1; + rgb_buf += 4; // Advance 1 pixels. + } +} + +// 12 bit YUV to ARGB +void I212ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel12(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) { uint32_t ar30; - b = b >> 4; // convert 10.6 to 10 bit. + b = b >> 4; // convert 8 bit 10.6 to 10 bit. 
g = g >> 4; r = r >> 4; b = Clamp10(b); @@ -1845,9 +2007,9 @@ void I210ToAR30Row_C(const uint16_t* src_y, int g; int r; for (x = 0; x < width - 1; x += 2) { - YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf, b, g, r); - YuvPixel16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + YuvPixel10_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf + 4, b, g, r); src_y += 2; src_u += 1; @@ -1855,16 +2017,15 @@ void I210ToAR30Row_C(const uint16_t* src_y, rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf, b, g, r); } } -// 8 bit YUV to 10 bit AR30 -// Uses same code as 10 bit YUV bit shifts the 8 bit values up to 10 bits. -void I422ToAR30Row_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, +// 12 bit YUV to 10 bit AR30 +void I212ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { @@ -1873,9 +2034,9 @@ void I422ToAR30Row_C(const uint8_t* src_y, int g; int r; for (x = 0; x < width - 1; x += 2) { - YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf, b, g, r); - YuvPixel8_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + YuvPixel12_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf + 4, b, g, r); src_y += 2; src_u += 1; @@ -1883,45 +2044,142 @@ void I422ToAR30Row_C(const uint8_t* src_y, rgb_buf += 8; // Advance 2 pixels. 
} if (width & 1) { - YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf, b, g, r); } } -#if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON)) -// C mimic assembly. -// TODO(fbarchard): Remove subsampling from Neon. -void I444AlphaToARGBRow_C(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - const uint8_t* src_a, - uint8_t* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { +void I410ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width; ++x) { + YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + src_y += 1; + src_u += 1; + src_v += 1; + rgb_buf += 4; // Advance 1 pixel. + } +} + +// P210 has 10 bits in msb of 16 bit NV12 style layout. +void P210ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint8_t u = (src_u[0] + src_u[1] + 1) >> 1; - uint8_t v = (src_v[0] + src_v[1] + 1) >> 1; - YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, - yuvconstants); - rgb_buf[3] = src_a[0]; - YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, - yuvconstants); - rgb_buf[7] = src_a[1]; + YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1, + dst_argb + 2, yuvconstants); + dst_argb[3] = 255; + YuvPixel16_8(src_y[1], src_uv[0], src_uv[1], dst_argb + 4, dst_argb + 5, + dst_argb + 6, yuvconstants); + dst_argb[7] = 255; src_y += 2; - src_u += 2; - src_v += 2; - src_a += 2; + src_uv += 2; + dst_argb += 8; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1, + dst_argb + 2, yuvconstants); + dst_argb[3] = 255; + } +} + +void P410ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1, + dst_argb + 2, yuvconstants); + dst_argb[3] = 255; + src_y += 1; + src_uv += 2; + dst_argb += 4; // Advance 1 pixels. + } +} + +void P210ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); + StoreAR30(dst_ar30, b, g, r); + YuvPixel16_16(src_y[1], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); + StoreAR30(dst_ar30 + 4, b, g, r); + src_y += 2; + src_uv += 2; + dst_ar30 += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); + StoreAR30(dst_ar30, b, g, r); + } +} + +void P410ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width; ++x) { + YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); + StoreAR30(dst_ar30, b, g, r); + src_y += 1; + src_uv += 2; + dst_ar30 += 4; // Advance 1 pixel. + } +} + +// 8 bit YUV to 10 bit AR30 +// Uses same code as 10 bit YUV bit shifts the 8 bit values up to 10 bits. 
+void I422ToAR30Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + YuvPixel8_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf + 4, b, g, r); + src_y += 2; + src_u += 1; + src_v += 1; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - rgb_buf[3] = src_a[0]; + YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); } } -#else + void I444AlphaToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1941,7 +2199,6 @@ void I444AlphaToARGBRow_C(const uint8_t* src_y, rgb_buf += 4; // Advance 1 pixel. } } -#endif void I422AlphaToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, @@ -2492,6 +2749,105 @@ void MergeARGBRow_C(const uint8_t* src_r, } } +void MergeXR30Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width) { + assert(depth >= 10); + assert(depth <= 16); + int x; + int shift = depth - 10; + uint32_t* dst_ar30_32 = (uint32_t*)dst_ar30; + for (x = 0; x < width; ++x) { + uint32_t r = clamp1023(src_r[x] >> shift); + uint32_t g = clamp1023(src_g[x] >> shift); + uint32_t b = clamp1023(src_b[x] >> shift); + dst_ar30_32[x] = b | (g << 10) | (r << 20) | 0xc0000000; + } +} + +void MergeAR64Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width) { + assert(depth >= 1); + assert(depth <= 16); + int x; + int shift = 16 - depth; + int max = (1 << depth) - 1; + for (x = 0; x < width; ++x) { + dst_ar64[0] = ClampMax(src_b[x], max) << 
shift; + dst_ar64[1] = ClampMax(src_g[x], max) << shift; + dst_ar64[2] = ClampMax(src_r[x], max) << shift; + dst_ar64[3] = ClampMax(src_a[x], max) << shift; + dst_ar64 += 4; + } +} + +void MergeARGB16To8Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width) { + assert(depth >= 8); + assert(depth <= 16); + int x; + int shift = depth - 8; + for (x = 0; x < width; ++x) { + dst_argb[0] = clamp255(src_b[x] >> shift); + dst_argb[1] = clamp255(src_g[x] >> shift); + dst_argb[2] = clamp255(src_r[x] >> shift); + dst_argb[3] = clamp255(src_a[x] >> shift); + dst_argb += 4; + } +} + +void MergeXR64Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width) { + assert(depth >= 1); + assert(depth <= 16); + int x; + int shift = 16 - depth; + int max = (1 << depth) - 1; + for (x = 0; x < width; ++x) { + dst_ar64[0] = ClampMax(src_b[x], max) << shift; + dst_ar64[1] = ClampMax(src_g[x], max) << shift; + dst_ar64[2] = ClampMax(src_r[x], max) << shift; + dst_ar64[3] = 0xffff; + dst_ar64 += 4; + } +} + +void MergeXRGB16To8Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width) { + assert(depth >= 8); + assert(depth <= 16); + int x; + int shift = depth - 8; + for (x = 0; x < width; ++x) { + dst_argb[0] = clamp255(src_b[x] >> shift); + dst_argb[1] = clamp255(src_g[x] >> shift); + dst_argb[2] = clamp255(src_r[x] >> shift); + dst_argb[3] = 0xff; + dst_argb += 4; + } +} + void SplitXRGBRow_C(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -2528,6 +2884,8 @@ void MergeUVRow_16_C(const uint16_t* src_u, int depth, int width) { int shift = 16 - depth; + assert(depth >= 8); + assert(depth <= 16); int x; for (x = 0; x < width; ++x) { dst_uv[0] = src_u[x] << shift; @@ -2544,6 +2902,8 @@ void SplitUVRow_16_C(const uint16_t* src_uv, int width) { int shift = 16 
- depth; int x; + assert(depth >= 8); + assert(depth <= 16); for (x = 0; x < width; ++x) { dst_u[x] = src_uv[0] >> shift; dst_v[x] = src_uv[1] >> shift; @@ -2581,6 +2941,9 @@ void Convert16To8Row_C(const uint16_t* src_y, int scale, int width) { int x; + assert(scale >= 256); + assert(scale <= 32768); + for (x = 0; x < width; ++x) { dst_y[x] = clamp255((src_y[x] * scale) >> 16); } @@ -2714,19 +3077,19 @@ void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { #define BLEND(f, b, a) clamp255((((256 - a) * b) >> 8) + f) -// Blend src_argb0 over src_argb1 and store to dst_argb. -// dst_argb may be src_argb0 or src_argb1. +// Blend src_argb over src_argb1 and store to dst_argb. +// dst_argb may be src_argb or src_argb1. // This code mimics the SSSE3 version for better testability. -void ARGBBlendRow_C(const uint8_t* src_argb0, +void ARGBBlendRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint32_t fb = src_argb0[0]; - uint32_t fg = src_argb0[1]; - uint32_t fr = src_argb0[2]; - uint32_t a = src_argb0[3]; + uint32_t fb = src_argb[0]; + uint32_t fg = src_argb[1]; + uint32_t fr = src_argb[2]; + uint32_t a = src_argb[3]; uint32_t bb = src_argb1[0]; uint32_t bg = src_argb1[1]; uint32_t br = src_argb1[2]; @@ -2735,10 +3098,10 @@ void ARGBBlendRow_C(const uint8_t* src_argb0, dst_argb[2] = BLEND(fr, br, a); dst_argb[3] = 255u; - fb = src_argb0[4 + 0]; - fg = src_argb0[4 + 1]; - fr = src_argb0[4 + 2]; - a = src_argb0[4 + 3]; + fb = src_argb[4 + 0]; + fg = src_argb[4 + 1]; + fr = src_argb[4 + 2]; + a = src_argb[4 + 3]; bb = src_argb1[4 + 0]; bg = src_argb1[4 + 1]; br = src_argb1[4 + 2]; @@ -2746,16 +3109,16 @@ void ARGBBlendRow_C(const uint8_t* src_argb0, dst_argb[4 + 1] = BLEND(fg, bg, a); dst_argb[4 + 2] = BLEND(fr, br, a); dst_argb[4 + 3] = 255u; - src_argb0 += 8; + src_argb += 8; src_argb1 += 8; dst_argb += 8; } if (width & 1) { - uint32_t fb = src_argb0[0]; - uint32_t fg = 
src_argb0[1]; - uint32_t fr = src_argb0[2]; - uint32_t a = src_argb0[3]; + uint32_t fb = src_argb[0]; + uint32_t fg = src_argb[1]; + uint32_t fr = src_argb[2]; + uint32_t a = src_argb[3]; uint32_t bb = src_argb1[0]; uint32_t bg = src_argb1[1]; uint32_t br = src_argb1[2]; @@ -3280,7 +3643,7 @@ void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { // Maximum temporary width for wrappers to process at a time, in pixels. #define MAXTWIDTH 2048 -#if !(defined(_MSC_VER) && defined(_M_IX86)) && \ +#if !(defined(_MSC_VER) && !defined(__clang__) && defined(_M_IX86)) && \ defined(HAS_I422TORGB565ROW_SSSE3) // row_win.cc has asm version, but GCC uses 2 step wrapper. void I422ToRGB565Row_SSSE3(const uint8_t* src_y, @@ -3747,13 +4110,14 @@ void NV21ToYUV24Row_C(const uint8_t* src_y, } // Filter 2 rows of AYUV UV's (444) into UV (420). +// AYUV is VUYA in memory. UV for NV12 is UV order in memory. void AYUVToUVRow_C(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_uv, int width) { // Output a row of UV values, filtering 2x2 rows of AYUV. int x; - for (x = 0; x < width; x += 2) { + for (x = 0; x < width - 1; x += 2) { dst_uv[0] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] + src_ayuv[src_stride_ayuv + 5] + 2) >> 2; @@ -3764,12 +4128,8 @@ void AYUVToUVRow_C(const uint8_t* src_ayuv, dst_uv += 2; } if (width & 1) { - dst_uv[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + - src_ayuv[src_stride_ayuv + 0] + 2) >> - 2; - dst_uv[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + - src_ayuv[src_stride_ayuv + 1] + 2) >> - 2; + dst_uv[0] = (src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + 1) >> 1; + dst_uv[1] = (src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + 1) >> 1; } } @@ -3780,7 +4140,7 @@ void AYUVToVURow_C(const uint8_t* src_ayuv, int width) { // Output a row of VU values, filtering 2x2 rows of AYUV. 
int x; - for (x = 0; x < width; x += 2) { + for (x = 0; x < width - 1; x += 2) { dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] + src_ayuv[src_stride_ayuv + 4] + 2) >> 2; @@ -3791,12 +4151,8 @@ void AYUVToVURow_C(const uint8_t* src_ayuv, dst_vu += 2; } if (width & 1) { - dst_vu[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + - src_ayuv[src_stride_ayuv + 0] + 2) >> - 2; - dst_vu[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + - src_ayuv[src_stride_ayuv + 1] + 2) >> - 2; + dst_vu[0] = (src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + 1) >> 1; + dst_vu[1] = (src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + 1) >> 1; } } diff --git a/third_party/libyuv/source/row_gcc.cc b/third_party/libyuv/source/row_gcc.cc index faf0fc9104..001c353dbe 100644 --- a/third_party/libyuv/source/row_gcc.cc +++ b/third_party/libyuv/source/row_gcc.cc @@ -16,8 +16,7 @@ extern "C" { #endif // This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) @@ -1078,6 +1077,222 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { } #endif +static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, + 10, 9, 8, 11, 14, 13, 12, 15}; + +static const uvec8 kShuffleARGBToAB64Lo = {2, 2, 1, 1, 0, 0, 3, 3, + 6, 6, 5, 5, 4, 4, 7, 7}; +static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11, + 14, 14, 13, 13, 12, 12, 15, 15}; + +void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x4,%2 \n" + 
"jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ar64), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} + +void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, + uint16_t* dst_ab64, + int width) { + asm volatile( + + "movdqa %3,%%xmm2 \n" + "movdqa %4,%%xmm3 \n" LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm0 \n" + "pshufb %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToAB64Lo), // %3 + "m"(kShuffleARGBToAB64Hi) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} + +void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, + uint8_t* dst_argb, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "psrlw $8,%%xmm0 \n" + "psrlw $8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} + +void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, + uint8_t* dst_argb, + int width) { + asm volatile( + + "movdqa %3,%%xmm2 \n" LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "psrlw $8,%%xmm0 \n" + "psrlw $8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "pshufb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToABGR) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} + +#ifdef HAS_ARGBTOAR64ROW_AVX2 +void ARGBToAR64Row_AVX2(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpermq 
$0xd8,%%ymm0,%%ymm0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x40(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ar64), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif + +#ifdef HAS_ARGBTOAB64ROW_AVX2 +void ARGBToAB64Row_AVX2(const uint8_t* src_argb, + uint16_t* dst_ab64, + int width) { + asm volatile( + + "vbroadcastf128 %3,%%ymm2 \n" + "vbroadcastf128 %4,%%ymm3 \n" LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm3,%%ymm0,%%ymm1 \n" + "vpshufb %%ymm2,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x40(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToAB64Lo), // %3 + "m"(kShuffleARGBToAB64Hi) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif + +#ifdef HAS_AR64TOARGBROW_AVX2 +void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, + uint8_t* dst_argb, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpsrlw $8,%%ymm0,%%ymm0 \n" + "vpsrlw $8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x40(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif + +#ifdef HAS_AB64TOARGBROW_AVX2 +void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, + uint8_t* dst_argb, + int width) { + asm volatile( + + "vbroadcastf128 %3,%%ymm2 \n" LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpsrlw $8,%%ymm0,%%ymm0 \n" + "vpsrlw $8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq 
$0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm2,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x40(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToABGR) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif + // clang-format off // TODO(mraptis): Consider passing R, G, B multipliers as parameter. @@ -1290,7 +1505,7 @@ void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { #endif // HAS_RGBATOYJROW_AVX2 #ifdef HAS_ARGBTOUVROW_SSSE3 -void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, +void ARGBToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1342,7 +1557,7 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1359,7 +1574,7 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, static const lvec8 kShufARGBToUV_AVX = { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; -void ARGBToUVRow_AVX2(const uint8_t* src_argb0, +void ARGBToUVRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1407,7 +1622,7 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0, "sub $0x20,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1422,7 +1637,7 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0, #endif // HAS_ARGBTOUVROW_AVX2 #ifdef HAS_ABGRTOUVROW_AVX2 -void ABGRToUVRow_AVX2(const uint8_t* src_abgr0, +void ABGRToUVRow_AVX2(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_u, uint8_t* dst_v, @@ -1470,7 +1685,7 @@ void ABGRToUVRow_AVX2(const uint8_t* src_abgr0, "sub $0x20,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_abgr0), // %0 + : "+r"(src_abgr), // %0 
"+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1485,7 +1700,7 @@ void ABGRToUVRow_AVX2(const uint8_t* src_abgr0, #endif // HAS_ABGRTOUVROW_AVX2 #ifdef HAS_ARGBTOUVJROW_AVX2 -void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, +void ARGBToUVJRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1534,7 +1749,7 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, "sub $0x20,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1549,7 +1764,7 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, #endif // HAS_ARGBTOUVJROW_AVX2 #ifdef HAS_ARGBTOUVJROW_SSSE3 -void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, +void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1602,7 +1817,7 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1689,7 +1904,7 @@ void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { "xmm7"); } -void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, +void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, uint8_t* dst_v, @@ -1741,7 +1956,7 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_bgra0), // %0 + : "+r"(src_bgra), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1786,7 +2001,7 @@ void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { "xmm7"); } -void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, +void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_u, uint8_t* dst_v, @@ -1838,7 +2053,7 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_abgr0), // %0 + : 
"+r"(src_abgr), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1849,7 +2064,7 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } -void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, +void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, int src_stride_rgba, uint8_t* dst_u, uint8_t* dst_v, @@ -1901,7 +2116,7 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_rgba0), // %0 + : "+r"(src_rgba), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1916,21 +2131,21 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, // Read 8 UV from 444 #define READYUV444 \ - "movq (%[u_buf]),%%xmm0 \n" \ + "movq (%[u_buf]),%%xmm3 \n" \ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklbw %%xmm1,%%xmm3 \n" \ "movq (%[y_buf]),%%xmm4 \n" \ "punpcklbw %%xmm4,%%xmm4 \n" \ "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 UV from 422, upsample to 8 UV #define READYUV422 \ - "movd (%[u_buf]),%%xmm0 \n" \ + "movd (%[u_buf]),%%xmm3 \n" \ "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x4(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ + "punpcklbw %%xmm1,%%xmm3 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ "movq (%[y_buf]),%%xmm4 \n" \ "punpcklbw %%xmm4,%%xmm4 \n" \ "lea 0x8(%[y_buf]),%[y_buf] \n" @@ -1940,24 +2155,87 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, // TODO(fbarchard): Consider pmulhuw to replace psraw // TODO(fbarchard): Consider pmullw to replace psllw and allow different bits. 
#define READYUV210 \ - "movq (%[u_buf]),%%xmm0 \n" \ + "movq (%[u_buf]),%%xmm3 \n" \ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "punpcklwd %%xmm1,%%xmm0 \n" \ - "psraw $0x2,%%xmm0 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ + "punpcklwd %%xmm1,%%xmm3 \n" \ + "psraw $2,%%xmm3 \n" \ + "packuswb %%xmm3,%%xmm3 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "psllw $6,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +#define READYUVA210 \ + "movq (%[u_buf]),%%xmm3 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklwd %%xmm1,%%xmm3 \n" \ + "psraw $2,%%xmm3 \n" \ + "packuswb %%xmm3,%%xmm3 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "psllw $6,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" \ + "movdqu (%[a_buf]),%%xmm5 \n" \ + "psraw $2,%%xmm5 \n" \ + "packuswb %%xmm5,%%xmm5 \n" \ + "lea 0x10(%[a_buf]),%[a_buf] \n" + +// Read 8 UV from 444 10 bit +#define READYUV410 \ + "movdqu (%[u_buf]),%%xmm3 \n" \ + "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "psraw $2,%%xmm3 \n" \ + "psraw $2,%%xmm2 \n" \ + "movdqa %%xmm3,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm3 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "packuswb %%xmm1,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "psllw $6,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 444 10 bit. With 8 Alpha. 
+#define READYUVA410 \ + "movdqu (%[u_buf]),%%xmm3 \n" \ + "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "psraw $2,%%xmm3 \n" \ + "psraw $2,%%xmm2 \n" \ + "movdqa %%xmm3,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm3 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "packuswb %%xmm1,%%xmm3 \n" \ "movdqu (%[y_buf]),%%xmm4 \n" \ "psllw $0x6,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" \ + "movdqu (%[a_buf]),%%xmm5 \n" \ + "psraw $2,%%xmm5 \n" \ + "packuswb %%xmm5,%%xmm5 \n" \ + "lea 0x10(%[a_buf]),%[a_buf] \n" + +// Read 4 UV from 422 12 bit, upsample to 8 UV +#define READYUV212 \ + "movq (%[u_buf]),%%xmm3 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklwd %%xmm1,%%xmm3 \n" \ + "psraw $0x4,%%xmm3 \n" \ + "packuswb %%xmm3,%%xmm3 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "psllw $0x4,%%xmm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. #define READYUVA422 \ - "movd (%[u_buf]),%%xmm0 \n" \ + "movd (%[u_buf]),%%xmm3 \n" \ "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x4(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ + "punpcklbw %%xmm1,%%xmm3 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ "movq (%[y_buf]),%%xmm4 \n" \ "punpcklbw %%xmm4,%%xmm4 \n" \ "lea 0x8(%[y_buf]),%[y_buf] \n" \ @@ -1966,10 +2244,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, // Read 8 UV from 444. With 8 Alpha. 
#define READYUVA444 \ - "movq (%[u_buf]),%%xmm0 \n" \ + "movq (%[u_buf]),%%xmm3 \n" \ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklbw %%xmm1,%%xmm3 \n" \ "movq (%[y_buf]),%%xmm4 \n" \ "punpcklbw %%xmm4,%%xmm4 \n" \ "lea 0x8(%[y_buf]),%[y_buf] \n" \ @@ -1978,18 +2256,18 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, // Read 4 UV from NV12, upsample to 8 UV #define READNV12 \ - "movq (%[uv_buf]),%%xmm0 \n" \ + "movq (%[uv_buf]),%%xmm3 \n" \ "lea 0x8(%[uv_buf]),%[uv_buf] \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ "movq (%[y_buf]),%%xmm4 \n" \ "punpcklbw %%xmm4,%%xmm4 \n" \ "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 VU from NV21, upsample to 8 UV #define READNV21 \ - "movq (%[vu_buf]),%%xmm0 \n" \ + "movq (%[vu_buf]),%%xmm3 \n" \ "lea 0x8(%[vu_buf]),%[vu_buf] \n" \ - "pshufb %[kShuffleNV21], %%xmm0 \n" \ + "pshufb %[kShuffleNV21], %%xmm3 \n" \ "movq (%[y_buf]),%%xmm4 \n" \ "punpcklbw %%xmm4,%%xmm4 \n" \ "lea 0x8(%[y_buf]),%[y_buf] \n" @@ -1998,68 +2276,92 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, #define READYUY2 \ "movdqu (%[yuy2_buf]),%%xmm4 \n" \ "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ - "movdqu (%[yuy2_buf]),%%xmm0 \n" \ - "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ + "movdqu (%[yuy2_buf]),%%xmm3 \n" \ + "pshufb %[kShuffleYUY2UV], %%xmm3 \n" \ "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n" // Read 4 UYVY with 8 Y and update 4 UV to 8 UV. 
#define READUYVY \ "movdqu (%[uyvy_buf]),%%xmm4 \n" \ "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ - "movdqu (%[uyvy_buf]),%%xmm0 \n" \ - "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ + "movdqu (%[uyvy_buf]),%%xmm3 \n" \ + "pshufb %[kShuffleUYVYUV], %%xmm3 \n" \ "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n" +// Read 4 UV from P210, upsample to 8 UV +#define READP210 \ + "movdqu (%[uv_buf]),%%xmm3 \n" \ + "lea 0x10(%[uv_buf]),%[uv_buf] \n" \ + "psrlw $0x8,%%xmm3 \n" \ + "packuswb %%xmm3,%%xmm3 \n" \ + "punpcklwd %%xmm3,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from P410 +#define READP410 \ + "movdqu (%[uv_buf]),%%xmm3 \n" \ + "movdqu 0x10(%[uv_buf]),%%xmm1 \n" \ + "lea 0x20(%[uv_buf]),%[uv_buf] \n" \ + "psrlw $0x8,%%xmm3 \n" \ + "psrlw $0x8,%%xmm1 \n" \ + "packuswb %%xmm1,%%xmm3 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + #if defined(__x86_64__) #define YUVTORGB_SETUP(yuvconstants) \ + "pcmpeqb %%xmm13,%%xmm13 \n" \ "movdqa (%[yuvconstants]),%%xmm8 \n" \ + "pxor %%xmm12,%%xmm12 \n" \ "movdqa 32(%[yuvconstants]),%%xmm9 \n" \ + "psllw $7,%%xmm13 \n" \ "movdqa 64(%[yuvconstants]),%%xmm10 \n" \ + "pshufb %%xmm12,%%xmm13 \n" \ "movdqa 96(%[yuvconstants]),%%xmm11 \n" \ - "movdqa 128(%[yuvconstants]),%%xmm12 \n" \ - "movdqa 160(%[yuvconstants]),%%xmm13 \n" \ - "movdqa 192(%[yuvconstants]),%%xmm14 \n" + "movdqa 128(%[yuvconstants]),%%xmm12 \n" + // Convert 8 pixels: 8 UV and 8 Y #define YUVTORGB16(yuvconstants) \ - "movdqa %%xmm0,%%xmm1 \n" \ - "movdqa %%xmm0,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm3 \n" \ - "movdqa %%xmm11,%%xmm0 \n" \ - "pmaddubsw %%xmm8,%%xmm1 \n" \ - "psubw %%xmm1,%%xmm0 \n" \ - "movdqa %%xmm12,%%xmm1 \n" \ - "pmaddubsw %%xmm9,%%xmm2 \n" \ - "psubw %%xmm2,%%xmm1 \n" \ - "movdqa %%xmm13,%%xmm2 \n" \ - "pmaddubsw %%xmm10,%%xmm3 \n" \ - "psubw %%xmm3,%%xmm2 \n" \ - "pmulhuw %%xmm14,%%xmm4 \n" \ + "psubb %%xmm13,%%xmm3 \n" \ + "pmulhuw %%xmm11,%%xmm4 \n" \ + "movdqa %%xmm8,%%xmm0 \n" \ + 
"movdqa %%xmm9,%%xmm1 \n" \ + "movdqa %%xmm10,%%xmm2 \n" \ + "paddw %%xmm12,%%xmm4 \n" \ + "pmaddubsw %%xmm3,%%xmm0 \n" \ + "pmaddubsw %%xmm3,%%xmm1 \n" \ + "pmaddubsw %%xmm3,%%xmm2 \n" \ "paddsw %%xmm4,%%xmm0 \n" \ - "paddsw %%xmm4,%%xmm1 \n" \ - "paddsw %%xmm4,%%xmm2 \n" -#define YUVTORGB_REGS \ - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + "paddsw %%xmm4,%%xmm2 \n" \ + "psubsw %%xmm1,%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm1 \n" + +#define YUVTORGB_REGS "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", #else #define YUVTORGB_SETUP(yuvconstants) // Convert 8 pixels: 8 UV and 8 Y #define YUVTORGB16(yuvconstants) \ - "movdqa %%xmm0,%%xmm1 \n" \ - "movdqa %%xmm0,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm3 \n" \ - "movdqa 96(%[yuvconstants]),%%xmm0 \n" \ - "pmaddubsw (%[yuvconstants]),%%xmm1 \n" \ - "psubw %%xmm1,%%xmm0 \n" \ - "movdqa 128(%[yuvconstants]),%%xmm1 \n" \ - "pmaddubsw 32(%[yuvconstants]),%%xmm2 \n" \ - "psubw %%xmm2,%%xmm1 \n" \ - "movdqa 160(%[yuvconstants]),%%xmm2 \n" \ - "pmaddubsw 64(%[yuvconstants]),%%xmm3 \n" \ - "psubw %%xmm3,%%xmm2 \n" \ - "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \ + "pcmpeqb %%xmm0,%%xmm0 \n" \ + "pxor %%xmm1,%%xmm1 \n" \ + "psllw $7,%%xmm0 \n" \ + "pshufb %%xmm1,%%xmm0 \n" \ + "psubb %%xmm0,%%xmm3 \n" \ + "pmulhuw 96(%[yuvconstants]),%%xmm4 \n" \ + "movdqa (%[yuvconstants]),%%xmm0 \n" \ + "movdqa 32(%[yuvconstants]),%%xmm1 \n" \ + "movdqa 64(%[yuvconstants]),%%xmm2 \n" \ + "pmaddubsw %%xmm3,%%xmm0 \n" \ + "pmaddubsw %%xmm3,%%xmm1 \n" \ + "pmaddubsw %%xmm3,%%xmm2 \n" \ + "movdqa 128(%[yuvconstants]),%%xmm3 \n" \ + "paddw %%xmm3,%%xmm4 \n" \ "paddsw %%xmm4,%%xmm0 \n" \ - "paddsw %%xmm4,%%xmm1 \n" \ - "paddsw %%xmm4,%%xmm2 \n" + "paddsw %%xmm4,%%xmm2 \n" \ + "psubsw %%xmm1,%%xmm4 \n" \ + "movdqa %%xmm4,%%xmm1 \n" + #define YUVTORGB_REGS #endif @@ -2275,8 +2577,8 @@ void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf, "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants "psrlw $14,%%xmm5 \n" "psllw $4,%%xmm5 \n" // 2 alpha bits 
- "pxor %%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" "psrlw $6,%%xmm7 \n" // 1023 for max LABELALIGN @@ -2327,6 +2629,36 @@ void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf, ); } +// 12 bit YUV to ARGB +void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV212 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + // 10 bit YUV to AR30 void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, const uint16_t* u_buf, @@ -2340,8 +2672,8 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $14,%%xmm5 \n" "psllw $4,%%xmm5 \n" // 2 alpha bits - "pxor %%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" "psrlw $6,%%xmm7 \n" // 1023 for max LABELALIGN @@ -2362,6 +2694,176 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, ); } +// 12 bit YUV to AR30 +void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" 
+ "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READYUV212 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +// 10 bit YUV to ARGB +void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV410 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +#ifdef HAS_I210ALPHATOARGBROW_SSSE3 +// 10 bit YUVA to ARGB +void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP( + yuvconstants) "sub %[u_buf],%[v_buf] \n" + + LABELALIGN "1: \n" READYUVA210 + YUVTORGB(yuvconstants) STOREARGB + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), + [dst_argb] "+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width] "+m"(width) // %[width] +#else + [width] "+rm"(width) // %[width] +#endif + : 
[yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); +} +#endif + +#ifdef HAS_I410ALPHATOARGBROW_SSSE3 +// 10 bit YUVA to ARGB +void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUVA410 + YUVTORGB(yuvconstants) + STOREARGB + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), + [dst_argb] "+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width] "+m"(width) // %[width] +#else + [width] "+rm"(width) // %[width] +#endif + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); + // clang-format on +} +#endif + +// 10 bit YUV to AR30 +void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READYUV410 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", 
"xmm4", "xmm5", "xmm6", "xmm7" + ); +} + #ifdef HAS_I422ALPHATOARGBROW_SSSE3 void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, @@ -2513,6 +3015,112 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, // clang-format on } +void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP( + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN "1: \n" READP210 + YUVTORGB(yuvconstants) STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf] "+r"(y_buf), // %[y_buf] + [uv_buf] "+r"(uv_buf), // %[u_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); +} + +void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP( + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN "1: \n" READP410 + YUVTORGB(yuvconstants) STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf] "+r"(y_buf), // %[y_buf] + [uv_buf] "+r"(uv_buf), // %[u_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); +} + +void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READP210 + 
YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READP410 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2546,12 +3154,12 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, // Read 16 UV from 444 #define READYUV444_AVX2 \ - "vmovdqu (%[u_buf]),%%xmm0 \n" \ + "vmovdqu (%[u_buf]),%%xmm3 \n" \ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x10(%[u_buf]),%[u_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%xmm4 \n" \ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ @@ -2559,42 +3167,108 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, // Read 8 UV from 422, 
upsample to 16 UV. #define READYUV422_AVX2 \ - "vmovq (%[u_buf]),%%xmm0 \n" \ + "vmovq (%[u_buf]),%%xmm3 \n" \ "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%xmm4 \n" \ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" -// Read 8 UV from 210 10 bit, upsample to 16 UV +// Read 8 UV from 210, upsample to 16 UV // TODO(fbarchard): Consider vshufb to replace pack/unpack // TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1. #define READYUV210_AVX2 \ - "vmovdqu (%[u_buf]),%%xmm0 \n" \ + "vmovdqu (%[u_buf]),%%xmm3 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \ + "vpsraw $2,%%ymm3,%%ymm3 \n" \ + "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 210, upsample to 16 UV. With 16 Alpha. 
+#define READYUVA210_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm3 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \ + "vpsraw $2,%%ymm3,%%ymm3 \n" \ + "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" \ + "vmovdqu (%[a_buf]),%%ymm5 \n" \ + "vpsraw $2,%%ymm5,%%ymm5 \n" \ + "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \ + "lea 0x20(%[a_buf]),%[a_buf] \n" + +// Read 16 UV from 410 +#define READYUV410_AVX2 \ + "vmovdqu (%[u_buf]),%%ymm3 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \ + "lea 0x20(%[u_buf]),%[u_buf] \n" \ + "vpsraw $2,%%ymm3,%%ymm3 \n" \ + "vpsraw $2,%%ymm2,%%ymm2 \n" \ + "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \ + "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \ + "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 212 12 bit, upsample to 16 UV +#define READYUV212_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm3 \n" \ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x10(%[u_buf]),%[u_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpsraw $0x2,%%ymm0,%%ymm0 \n" \ - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \ + "vpsraw $0x4,%%ymm3,%%ymm3 \n" \ + "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $0x6,%%ymm4,%%ymm4 \n" \ + "vpsllw $0x4,%%ymm4,%%ymm4 \n" \ "lea 0x20(%[y_buf]),%[y_buf] \n" +// Read 16 UV from 410. With 16 Alpha. 
+#define READYUVA410_AVX2 \ + "vmovdqu (%[u_buf]),%%ymm3 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \ + "lea 0x20(%[u_buf]),%[u_buf] \n" \ + "vpsraw $2,%%ymm3,%%ymm3 \n" \ + "vpsraw $2,%%ymm2,%%ymm2 \n" \ + "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \ + "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \ + "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" \ + "vmovdqu (%[a_buf]),%%ymm5 \n" \ + "vpsraw $2,%%ymm5,%%ymm5 \n" \ + "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \ + "lea 0x20(%[a_buf]),%[a_buf] \n" + // Read 16 UV from 444. With 16 Alpha. #define READYUVA444_AVX2 \ - "vmovdqu (%[u_buf]),%%xmm0 \n" \ + "vmovdqu (%[u_buf]),%%xmm3 \n" \ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x10(%[u_buf]),%[u_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%xmm4 \n" \ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ @@ -2605,12 +3279,12 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. #define READYUVA422_AVX2 \ - "vmovq (%[u_buf]),%%xmm0 \n" \ + "vmovq (%[u_buf]),%%xmm3 \n" \ "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%xmm4 \n" \ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ @@ -2621,10 +3295,10 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, // Read 8 UV from NV12, upsample to 16 UV. 
#define READNV12_AVX2 \ - "vmovdqu (%[uv_buf]),%%xmm0 \n" \ + "vmovdqu (%[uv_buf]),%%xmm3 \n" \ "lea 0x10(%[uv_buf]),%[uv_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%xmm4 \n" \ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ @@ -2632,73 +3306,98 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, // Read 8 VU from NV21, upsample to 16 UV. #define READNV21_AVX2 \ - "vmovdqu (%[vu_buf]),%%xmm0 \n" \ + "vmovdqu (%[vu_buf]),%%xmm3 \n" \ "lea 0x10(%[vu_buf]),%[vu_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpshufb %[kShuffleNV21], %%ymm3, %%ymm3 \n" \ "vmovdqu (%[y_buf]),%%xmm4 \n" \ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" +// Read 4 UV from P210, upsample to 8 UV +#define READP210_AVX2 \ + "vmovdqu (%[uv_buf]),%%ymm3 \n" \ + "lea 0x20(%[uv_buf]),%[uv_buf] \n" \ + "vpsrlw $0x8,%%ymm3,%%ymm3 \n" \ + "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from P410 +#define READP410_AVX2 \ + "vmovdqu (%[uv_buf]),%%ymm3 \n" \ + "vmovdqu 0x20(%[uv_buf]),%%ymm1 \n" \ + "lea 0x40(%[uv_buf]),%[uv_buf] \n" \ + "vpsrlw $0x8,%%ymm3,%%ymm3 \n" \ + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" \ + "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" + // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. 
#define READYUY2_AVX2 \ "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \ "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ - "vmovdqu (%[yuy2_buf]),%%ymm0 \n" \ - "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ + "vmovdqu (%[yuy2_buf]),%%ymm3 \n" \ + "vpshufb %[kShuffleYUY2UV], %%ymm3, %%ymm3 \n" \ "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n" // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. #define READUYVY_AVX2 \ "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \ "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ - "vmovdqu (%[uyvy_buf]),%%ymm0 \n" \ - "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ + "vmovdqu (%[uyvy_buf]),%%ymm3 \n" \ + "vpshufb %[kShuffleUYVYUV], %%ymm3, %%ymm3 \n" \ "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n" #if defined(__x86_64__) -#define YUVTORGB_SETUP_AVX2(yuvconstants) \ - "vmovdqa (%[yuvconstants]),%%ymm8 \n" \ - "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \ - "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \ - "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \ - "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \ - "vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \ - "vmovdqa 192(%[yuvconstants]),%%ymm14 \n" +#define YUVTORGB_SETUP_AVX2(yuvconstants) \ + "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \ + "vmovdqa (%[yuvconstants]),%%ymm8 \n" \ + "vpsllw $7,%%xmm13,%%xmm13 \n" \ + "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \ + "vpbroadcastb %%xmm13,%%ymm13 \n" \ + "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \ + "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \ + "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" #define YUVTORGB16_AVX2(yuvconstants) \ - "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \ - "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \ - "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \ - "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \ - "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \ - "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \ - "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \ + "vpsubb %%ymm13,%%ymm3,%%ymm3 \n" \ + "vpmulhuw %%ymm11,%%ymm4,%%ymm4 \n" \ + "vpmaddubsw %%ymm3,%%ymm8,%%ymm0 \n" \ + "vpmaddubsw %%ymm3,%%ymm9,%%ymm1 \n" \ + "vpmaddubsw %%ymm3,%%ymm10,%%ymm2 \n" \ + "vpaddw 
%%ymm4,%%ymm12,%%ymm4 \n" \ "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ - "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ + "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \ "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" -#define YUVTORGB_REGS_AVX2 \ - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", +#define YUVTORGB_REGS_AVX2 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", #else // Convert 16 pixels: 16 UV and 16 Y. #define YUVTORGB_SETUP_AVX2(yuvconstants) #define YUVTORGB16_AVX2(yuvconstants) \ - "vpmaddubsw 64(%[yuvconstants]),%%ymm0,%%ymm2 \n" \ - "vpmaddubsw 32(%[yuvconstants]),%%ymm0,%%ymm1 \n" \ - "vpmaddubsw (%[yuvconstants]),%%ymm0,%%ymm0 \n" \ - "vmovdqu 160(%[yuvconstants]),%%ymm3 \n" \ - "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ - "vmovdqu 128(%[yuvconstants]),%%ymm3 \n" \ - "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ - "vmovdqu 96(%[yuvconstants]),%%ymm3 \n" \ - "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ - "vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \ + "vpcmpeqb %%xmm0,%%xmm0,%%xmm0 \n" \ + "vpsllw $7,%%xmm0,%%xmm0 \n" \ + "vpbroadcastb %%xmm0,%%ymm0 \n" \ + "vpsubb %%ymm0,%%ymm3,%%ymm3 \n" \ + "vpmulhuw 96(%[yuvconstants]),%%ymm4,%%ymm4 \n" \ + "vmovdqa (%[yuvconstants]),%%ymm0 \n" \ + "vmovdqa 32(%[yuvconstants]),%%ymm1 \n" \ + "vmovdqa 64(%[yuvconstants]),%%ymm2 \n" \ + "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" \ + "vpmaddubsw %%ymm3,%%ymm1,%%ymm1 \n" \ + "vpmaddubsw %%ymm3,%%ymm2,%%ymm2 \n" \ + "vmovdqa 128(%[yuvconstants]),%%ymm3 \n" \ + "vpaddw %%ymm4,%%ymm3,%%ymm4 \n" \ "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ - "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ + "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \ "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" + #define YUVTORGB_REGS_AVX2 #endif @@ -2721,7 +3420,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ "vmovdqu %%ymm1,(%[dst_argb]) \n" \ "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \ - "lea 0x40(%[dst_argb]), %[dst_argb] \n" + "lea 0x40(%[dst_argb]), %[dst_argb] \n" // Store 16 AR30 values. 
#define STOREAR30_AVX2 \ @@ -2894,6 +3593,41 @@ void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf, } #endif // HAS_I210TOARGBROW_AVX2 +#if defined(HAS_I212TOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP I212ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV212_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I212TOARGBROW_AVX2 + #if defined(HAS_I210TOAR30ROW_AVX2) // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). @@ -2929,11 +3663,198 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf, [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); } #endif // HAS_I210TOAR30ROW_AVX2 +#if defined(HAS_I212TOAR30ROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). 
+void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READYUV212_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_I212TOAR30ROW_AVX2 + +#if defined(HAS_I410TOARGBROW_AVX2) +// 16 pixels +// 16 UV values with 16 Y producing 16 ARGB (64 bytes). 
+void OMITFP I410ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV410_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I410TOARGBROW_AVX2 + +#if defined(HAS_I210ALPHATOARGBROW_AVX2) +// 16 pixels +// 8 UV, 16 Y and 16 A producing 16 ARGB (64 bytes). +void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP_AVX2( + yuvconstants) "sub %[u_buf],%[v_buf] \n" + + LABELALIGN "1: \n" READYUVA210_AVX2 + YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), // %[a_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width] "+m"(width) // %[width] +#else + [width] "+rm"(width) // %[width] +#endif + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5"); +} +#endif // HAS_I210TOARGBROW_AVX2 + +#if defined(HAS_I410ALPHATOARGBROW_AVX2) +// 16 pixels +// 16 UV, 16 Y and 16 A producing 16 ARGB (64 bytes). 
+void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP_AVX2( + yuvconstants) "sub %[u_buf],%[v_buf] \n" + + LABELALIGN "1: \n" READYUVA410_AVX2 + YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), // %[a_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width] "+m"(width) // %[width] +#else + [width] "+rm"(width) // %[width] +#endif + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5"); +} +#endif // HAS_I410TOARGBROW_AVX2 + +#if defined(HAS_I410TOAR30ROW_AVX2) +// 16 pixels +// 16 UV values with 16 Y producing 16 AR30 (64 bytes). 
+void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READYUV410_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_I410TOAR30ROW_AVX2 + #if defined(HAS_I444ALPHATOARGBROW_AVX2) // 16 pixels // 16 UV values with 16 Y and 16 A producing 16 ARGB. @@ -3193,14 +4114,154 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, } #endif // HAS_UYVYTOARGBROW_AVX2 +#if defined(HAS_P210TOARGBROW_AVX2) +// 16 pixels. +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
+void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READP210_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // HAS_P210TOARGBROW_AVX2 + +#if defined(HAS_P410TOARGBROW_AVX2) +// 16 pixels. +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READP410_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // HAS_P410TOARGBROW_AVX2 + +#if defined(HAS_P210TOAR30ROW_AVX2) +// 16 pixels +// 16 UV values with 16 Y producing 16 AR30 (64 bytes). 
+void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READP210_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_P210TOAR30ROW_AVX2 + +#if defined(HAS_P410TOAR30ROW_AVX2) +// 16 pixels +// 16 UV values with 16 Y producing 16 AR30 (64 bytes). 
+void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READP410_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_P410TOAR30ROW_AVX2 + #ifdef HAS_I400TOARGBROW_SSE2 void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile( - "movdqa 192(%3),%%xmm2 \n" // yg = 18997 = 1.164 - "movdqa 224(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16 + "movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164 + "movdqa 128(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16 "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000 "pslld $0x18,%%xmm4 \n" @@ -3244,8 +4305,8 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile( - "vmovdqa 192(%3),%%ymm2 \n" // yg = 18997 = 1.164 - "vmovdqa 224(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16 + "vmovdqa 96(%3),%%ymm2 \n" // yg = 18997 = 1.164 + "vmovdqa 128(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000 "vpslld $0x18,%%ymm4,%%ymm4 \n" @@ -3663,8 +4724,6 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, // clang-format off asm volatile ( "vmovd %4,%%xmm3 \n" - "vpunpcklwd 
%%xmm3,%%xmm3,%%xmm3 \n" - "vbroadcastss %%xmm3,%%xmm3 \n" "sub %0,%1 \n" // 16 pixels per loop. @@ -3696,7 +4755,7 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, } #endif // HAS_MERGEUVROW_AVX2 -#ifdef HAS_MERGEUVROW_16_AVX2 +#ifdef HAS_SPLITUVROW_16_AVX2 const uvec8 kSplitUVShuffle16 = {0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}; void SplitUVRow_16_AVX2(const uint16_t* src_uv, @@ -3707,44 +4766,41 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv, depth = 16 - depth; // clang-format off asm volatile ( - "vmovd %4,%%xmm3 \n" - "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" - "vbroadcastss %%xmm3,%%xmm3 \n" - "vbroadcastf128 %5,%%ymm4 \n" - "sub %1,%2 \n" + "vmovd %4,%%xmm3 \n" + "vbroadcastf128 %5,%%ymm4 \n" + "sub %1,%2 \n" // 16 pixels per loop. LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "add $0x40,%0 \n" - - "vpsrlw %%xmm3,%%ymm0,%%ymm0 \n" - "vpsrlw %%xmm3,%%ymm1,%%ymm1 \n" - "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vextractf128 $0x0,%%ymm0,(%1) \n" - "vextractf128 $0x0,%%ymm1,0x10(%1) \n" - "vextractf128 $0x1,%%ymm0,(%1,%2) \n" - "vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n" - "add $0x20,%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "add $0x40,%0 \n" + + "vpsrlw %%xmm3,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm3,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x0,%%ymm1,0x10(%1) \n" + "vextractf128 $0x1,%%ymm0,(%1,%2) \n" + "vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n" + "add $0x20,%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width), // %3 - "+r"(depth) // %4 - : + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + 
"+r"(width) // %3 + : "r"(depth), // %4 "m"(kSplitUVShuffle16) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); // clang-format on } -#endif // HAS_MERGEUVROW_AVX2 +#endif // HAS_SPLITUVROW_16_AVX2 // Use scale to convert lsb formats to msb, depending how many bits there are: // 128 = 9 bits @@ -3797,24 +4853,24 @@ void DivideRow_16_AVX2(const uint16_t* src_y, int width) { // clang-format off asm volatile ( - "vmovd %3,%%xmm3 \n" - "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" - "vbroadcastss %%xmm3,%%ymm3 \n" - "sub %0,%1 \n" + "vmovd %3,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" // 32 pixels per loop. LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%0,%1) \n" - "vmovdqu %%ymm1,0x20(%0,%1) \n" - "add $0x40,%0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%0,%1) \n" + "vmovdqu %%ymm1,0x20(%0,%1) \n" + "add $0x40,%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width), // %2 @@ -4202,7 +5258,9 @@ void MergeARGBRow_SSE2(const uint8_t* src_r, : : "memory", "cc", "xmm0", "xmm1", "xmm2"); } +#endif +#ifdef HAS_MERGEXRGBROW_SSE2 void MergeXRGBRow_SSE2(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -4286,7 +5344,9 @@ void MergeARGBRow_AVX2(const uint8_t* src_r, : : "memory", "cc", "xmm0", "xmm1", "xmm2"); } +#endif +#ifdef HAS_MERGEXRGBROW_AVX2 void MergeXRGBRow_AVX2(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -4380,7 +5440,9 @@ void SplitARGBRow_SSE2(const uint8_t* src_argb, : : "memory", "cc", "xmm0", "xmm1", "xmm2"); } +#endif +#ifdef HAS_SPLITXRGBROW_SSE2 void SplitXRGBRow_SSE2(const uint8_t* src_argb, uint8_t* 
dst_r, uint8_t* dst_g, @@ -4471,12 +5533,14 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb, #if defined(__i386__) "+m"(width) // %5 #else - "+rm"(width) // %5 + "+rm"(width) // %5 #endif : "m"(kShuffleMaskARGBSplit) // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } +#endif +#ifdef HAS_SPLITXRGBROW_SSSE3 void SplitXRGBRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -4562,13 +5626,15 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb, #if defined(__i386__) "+m"(width) // %5 #else - "+rm"(width) // %5 + "+rm"(width) // %5 #endif : "m"(kShuffleMaskARGBSplit), // %6 "m"(kShuffleMaskARGBPermute) // %7 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } +#endif +#ifdef HAS_SPLITXRGBROW_AVX2 void SplitXRGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -4610,7 +5676,318 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb, "+r"(width) // %4 : "m"(kShuffleMaskARGBSplit), // %5 "m"(kShuffleMaskARGBPermute) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif + +#ifdef HAS_MERGEXR30ROW_AVX2 +void MergeXR30Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width) { + int shift = depth - 10; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrlw $6,%%ymm6,%%ymm6 \n" + "vmovd %5,%%xmm4 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu (%0,%1),%%ymm1 \n" + "vmovdqu (%0,%2),%%ymm2 \n" + "vpsrlw %%xmm4,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm4,%%ymm1,%%ymm1 \n" + "vpsrlw %%xmm4,%%ymm2,%%ymm2 \n" + "vpminuw %%ymm0,%%ymm6,%%ymm0 \n" + "vpminuw %%ymm1,%%ymm6,%%ymm1 \n" + "vpminuw %%ymm2,%%ymm6,%%ymm2 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq 
$0xd8,%%ymm2,%%ymm2 \n" + "vpsllw $0x4,%%ymm0,%%ymm0 \n" // Shift R to target bit + "vpunpckhwd %%ymm0,%%ymm2,%%ymm3 \n" // RB + "vpunpcklwd %%ymm0,%%ymm2,%%ymm0 \n" + "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" // AG + "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" + "vpslld $0xa,%%ymm1,%%ymm1 \n" // Shift AG to target bit + "vpslld $0xa,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm0,%%ymm0 \n" // Combine + "vpor %%ymm2,%%ymm3,%%ymm3 \n" + "vmovdqu %%ymm0,(%3) \n" + "vmovdqu %%ymm3,0x20(%3) \n" + "lea 0x20(%0),%0 \n" + "lea 0x40(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 +#if defined(__i386__) + : "m"(shift) // %5 +#else + : "rm"(shift) // %5 +#endif + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif + +#ifdef HAS_MERGEAR64ROW_AVX2 +static const lvec32 MergeAR64Permute = {0, 4, 2, 6, 1, 5, 3, 7}; +void MergeAR64Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + mask = (mask << 16) + mask; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + "vmovdqa %8,%%ymm5 \n" + "vmovd %6,%%xmm6 \n" + "vbroadcastss %7,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // R + "vmovdqu (%0,%1),%%ymm1 \n" // G + "vmovdqu (%0,%2),%%ymm2 \n" // B + "vmovdqu (%0,%3),%%ymm3 \n" // A + "vpminuw %%ymm0,%%ymm7,%%ymm0 \n" + "vpminuw %%ymm1,%%ymm7,%%ymm1 \n" + "vpminuw %%ymm2,%%ymm7,%%ymm2 \n" + "vpminuw %%ymm3,%%ymm7,%%ymm3 \n" + "vpsllw %%xmm6,%%ymm0,%%ymm0 \n" + "vpsllw %%xmm6,%%ymm1,%%ymm1 \n" + "vpsllw %%xmm6,%%ymm2,%%ymm2 \n" + "vpsllw %%xmm6,%%ymm3,%%ymm3 \n" + "vpermd %%ymm0,%%ymm5,%%ymm0 \n" + "vpermd %%ymm1,%%ymm5,%%ymm1 \n" + "vpermd %%ymm2,%%ymm5,%%ymm2 \n" + "vpermd %%ymm3,%%ymm5,%%ymm3 \n" + "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low) + "vpunpckhwd 
%%ymm1,%%ymm2,%%ymm1 \n" // BG(hi) + "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low) + "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi) + "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1) + "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3) + "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2) + "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4) + "vmovdqu %%ymm3,(%4) \n" + "vmovdqu %%ymm2,0x20(%4) \n" + "vmovdqu %%ymm4,0x40(%4) \n" + "vmovdqu %%ymm1,0x60(%4) \n" + "lea 0x20(%0),%0 \n" + "lea 0x80(%4),%4 \n" + "subl $0x10,%5 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_ar64), // %4 +#if defined(__i386__) + "+m"(width) // %5 +#else + "+rm"(width) // %5 +#endif + : "m"(shift), // %6 + "m"(mask), // %7 + "m"(MergeAR64Permute) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_MERGEXR64ROW_AVX2 +void MergeXR64Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + mask = (mask << 16) + mask; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "vmovdqa %7,%%ymm5 \n" + "vmovd %5,%%xmm6 \n" + "vbroadcastss %6,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // R + "vmovdqu (%0,%1),%%ymm1 \n" // G + "vmovdqu (%0,%2),%%ymm2 \n" // B + "vpminuw %%ymm0,%%ymm7,%%ymm0 \n" + "vpminuw %%ymm1,%%ymm7,%%ymm1 \n" + "vpminuw %%ymm2,%%ymm7,%%ymm2 \n" + "vpsllw %%xmm6,%%ymm0,%%ymm0 \n" + "vpsllw %%xmm6,%%ymm1,%%ymm1 \n" + "vpsllw %%xmm6,%%ymm2,%%ymm2 \n" + "vpermd %%ymm0,%%ymm5,%%ymm0 \n" + "vpermd %%ymm1,%%ymm5,%%ymm1 \n" + "vpermd %%ymm2,%%ymm5,%%ymm2 \n" + "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" // A (0xffff) + "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low) + "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" // BG(hi) + "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low) + "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi) 
+ "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1) + "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3) + "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2) + "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4) + "vmovdqu %%ymm3,(%3) \n" + "vmovdqu %%ymm2,0x20(%3) \n" + "vmovdqu %%ymm4,0x40(%3) \n" + "vmovdqu %%ymm1,0x60(%3) \n" + "lea 0x20(%0),%0 \n" + "lea 0x80(%3),%3 \n" + "subl $0x10,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar64), // %3 + "+r"(width) // %4 + : "m"(shift), // %5 + "m"(mask), // %6 + "m"(MergeAR64Permute) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_MERGEARGB16TO8ROW_AVX2 +static const uvec8 MergeARGB16To8Shuffle = {0, 8, 1, 9, 2, 10, 3, 11, + 4, 12, 5, 13, 6, 14, 7, 15}; +void MergeARGB16To8Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width) { + int shift = depth - 8; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + "vbroadcastf128 %7,%%ymm5 \n" + "vmovd %6,%%xmm6 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // R + "vmovdqu (%0,%1),%%ymm1 \n" // G + "vmovdqu (%0,%2),%%ymm2 \n" // B + "vmovdqu (%0,%3),%%ymm3 \n" // A + "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n" + "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n" + "vpsrlw %%xmm6,%%ymm3,%%ymm3 \n" + "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n" // BG (planar) + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar) + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave) + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave) + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low) + "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi) + "vmovdqu %%ymm2,(%4) \n" + "vmovdqu %%ymm0,0x20(%4) \n" + "lea 0x20(%0),%0 \n" + "lea 0x40(%4),%4 \n" + "subl $0x10,%5 \n" + "jg 1b \n" + 
"vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 +#if defined(__i386__) + "+m"(width) // %5 +#else + "+rm"(width) // %5 +#endif + : "m"(shift), // %6 + "m"(MergeARGB16To8Shuffle) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +#ifdef HAS_MERGEXRGB16TO8ROW_AVX2 +void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width) { + int shift = depth - 8; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "vbroadcastf128 %6,%%ymm5 \n" + "vmovd %5,%%xmm6 \n" + "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" + "vpsrlw $8,%%ymm3,%%ymm3 \n" // A (0xff) + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // R + "vmovdqu (%0,%1),%%ymm1 \n" // G + "vmovdqu (%0,%2),%%ymm2 \n" // B + "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n" + "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n" // BG (planar) + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar) + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave) + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave) + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low) + "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi) + "vmovdqu %%ymm2,(%3) \n" + "vmovdqu %%ymm0,0x20(%3) \n" + "lea 0x20(%0),%0 \n" + "lea 0x40(%3),%3 \n" + "subl $0x10,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "m"(shift), // %5 + "m"(MergeARGB16To8Shuffle) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif @@ -5339,7 +6716,7 @@ static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; // Blend 8 pixels at a time -void ARGBBlendRow_SSSE3(const uint8_t* 
src_argb0, +void ARGBBlendRow_SSSE3(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -5410,7 +6787,7 @@ void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, "sub $0x1,%3 \n" "jge 91b \n" "99: \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -6012,7 +7389,7 @@ void ARGBShadeRow_SSE2(const uint8_t* src_argb, #ifdef HAS_ARGBMULTIPLYROW_SSE2 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, +void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -6040,7 +7417,7 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, "lea 0x10(%2),%2 \n" "sub $0x4,%3 \n" "jg 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -6051,7 +7428,7 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, #ifdef HAS_ARGBMULTIPLYROW_AVX2 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, +void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -6078,7 +7455,7 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, "sub $0x8,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -6089,7 +7466,7 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, #ifdef HAS_ARGBADDROW_SSE2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. 
-void ARGBAddRow_SSE2(const uint8_t* src_argb0, +void ARGBAddRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -6106,7 +7483,7 @@ void ARGBAddRow_SSE2(const uint8_t* src_argb0, "lea 0x10(%2),%2 \n" "sub $0x4,%3 \n" "jg 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -6117,7 +7494,7 @@ void ARGBAddRow_SSE2(const uint8_t* src_argb0, #ifdef HAS_ARGBADDROW_AVX2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBAddRow_AVX2(const uint8_t* src_argb0, +void ARGBAddRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -6134,7 +7511,7 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb0, "sub $0x8,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -6145,7 +7522,7 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb0, #ifdef HAS_ARGBSUBTRACTROW_SSE2 // Subtract 2 rows of ARGB pixels, 4 pixels at a time. -void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, +void ARGBSubtractRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -6162,7 +7539,7 @@ void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, "lea 0x10(%2),%2 \n" "sub $0x4,%3 \n" "jg 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -6173,7 +7550,7 @@ void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, #ifdef HAS_ARGBSUBTRACTROW_AVX2 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. 
-void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, +void ARGBSubtractRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -6190,7 +7567,7 @@ void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, "sub $0x8,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -7279,7 +8656,7 @@ void HalfFloatRow_AVX2(const uint16_t* src, #if defined(__x86_64__) : "x"(scale) // %3 #else - : "m"(scale) // %3 + : "m"(scale) // %3 #endif : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); } @@ -7317,7 +8694,7 @@ void HalfFloatRow_F16C(const uint16_t* src, #if defined(__x86_64__) : "x"(scale) // %3 #else - : "m"(scale) // %3 + : "m"(scale) // %3 #endif : "memory", "cc", "xmm2", "xmm3", "xmm4"); } diff --git a/third_party/libyuv/source/row_mmi.cc b/third_party/libyuv/source/row_mmi.cc index 9a8e2cb2d1..362fd1cfcc 100644 --- a/third_party/libyuv/source/row_mmi.cc +++ b/third_party/libyuv/source/row_mmi.cc @@ -605,7 +605,7 @@ void ARGBToARGB4444Row_MMI(const uint8_t* src_argb, : "memory"); } -void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ARGBToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -613,8 +613,8 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -626,8 +626,8 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 
0x0f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -639,8 +639,8 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -652,8 +652,8 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -671,20 +671,20 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[src_argb], %[src_argb], 0x20 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), 
[zero] "f"(0x00) : "memory"); } -void ARGBToUVRow_MMI(const uint8_t* src_rgb0, +void ARGBToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -700,9 +700,9 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "dli %[tmp0], 0x0001000100010001 \n\t" "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -720,8 +720,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -748,8 +748,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -767,8 +767,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) 
\n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -795,8 +795,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -814,8 +814,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -842,8 +842,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -861,8 +861,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -901,7 +901,7 
@@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -913,7 +913,7 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), @@ -921,7 +921,7 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, : "memory"); } -void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -929,8 +929,8 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -942,8 +942,8 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" "punpcklbh 
%[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -955,8 +955,8 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -968,8 +968,8 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -987,20 +987,20 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[src_argb], %[src_argb], 0x20 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void BGRAToUVRow_MMI(const uint8_t* src_rgb0, +void BGRAToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* 
dst_v, @@ -1016,9 +1016,9 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "dli %[tmp0], 0x0001000100010001 \n\t" "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1036,8 +1036,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1064,8 +1064,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src0], %[src1] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1083,8 +1083,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1111,8 +1111,8 @@ void 
BGRAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src0], %[src1] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1130,8 +1130,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1158,8 +1158,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src0], %[src1] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1177,8 +1177,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1217,7 +1217,7 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu 
%[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -1229,7 +1229,7 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), @@ -1237,7 +1237,7 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, : "memory"); } -void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -1245,8 +1245,8 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1258,8 +1258,8 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1271,8 
+1271,8 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1284,8 +1284,8 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1303,20 +1303,20 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[src_argb], %[src_argb], 0x20 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void ABGRToUVRow_MMI(const uint8_t* src_rgb0, +void ABGRToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -1332,9 +1332,9 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "dli %[tmp0], 0x0001000100010001 \n\t" "dmtc1 %[tmp0], 
%[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1352,8 +1352,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1380,8 +1380,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src0], %[src1] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1399,8 +1399,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1427,8 +1427,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src0], %[src1] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" 
- "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1446,8 +1446,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1474,8 +1474,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src0], %[src1] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1493,8 +1493,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1533,7 +1533,7 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu 
%[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -1545,7 +1545,7 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), @@ -1553,7 +1553,7 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, : "memory"); } -void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -1561,8 +1561,8 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1574,8 +1574,8 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1587,8 +1587,8 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" 
"psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1600,8 +1600,8 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1619,20 +1619,20 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[src_argb], %[src_argb], 0x20 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void RGBAToUVRow_MMI(const uint8_t* src_rgb0, +void RGBAToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -1648,9 +1648,9 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "dli %[tmp0], 0x0001000100010001 \n\t" "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) 
\n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1668,8 +1668,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1696,8 +1696,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1715,8 +1715,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1743,8 +1743,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 
0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1762,8 +1762,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1790,8 +1790,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1809,8 +1809,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1849,7 +1849,7 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -1861,7 +1861,7 @@ void 
RGBAToUVRow_MMI(const uint8_t* src_rgb0, [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), @@ -1869,7 +1869,7 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, : "memory"); } -void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -1877,8 +1877,8 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1891,8 +1891,8 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x06(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1905,8 +1905,8 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t" - 
"gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x13(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x0c(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1919,8 +1919,8 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x19(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x12(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1939,20 +1939,20 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t" + "daddiu %[src_argb], %[src_argb], 0x18 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, +void RGB24ToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -1968,9 +1968,9 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dli %[tmp0], 0x0001000100010001 \n\t" "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], 
%[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1990,8 +1990,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2020,8 +2020,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2041,8 +2041,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2071,8 +2071,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 
0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2092,8 +2092,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2122,8 +2122,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2143,8 +2143,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2185,7 +2185,7 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -2197,7 +2197,7 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), 
[dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), @@ -2205,7 +2205,7 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, : "memory"); } -void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -2213,8 +2213,8 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -2227,8 +2227,8 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x06(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -2241,8 +2241,8 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x13(%[src_argb]) \n\t" + "gsldrc1 %[src], 
0x0c(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -2255,8 +2255,8 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x19(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x12(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -2275,20 +2275,20 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t" + "daddiu %[src_argb], %[src_argb], 0x18 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void RAWToUVRow_MMI(const uint8_t* src_rgb0, +void RAWToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -2304,9 +2304,9 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dli %[tmp0], 0x0001000100010001 \n\t" "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" 
"gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2326,8 +2326,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2356,8 +2356,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src0], %[src1] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2377,8 +2377,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2407,8 +2407,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src0], %[src1] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2428,8 +2428,8 
@@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2458,8 +2458,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src0], %[src1] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2479,8 +2479,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2521,7 +2521,7 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -2533,7 +2533,7 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb0] "r"(src_rgb0), 
[src_stride_rgb] "r"(src_stride_rgb), + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), @@ -2541,7 +2541,7 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, : "memory"); } -void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ARGBToYJRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest, dest0, dest1, dest2, dest3; uint64_t tmp0, tmp1; @@ -2618,13 +2618,13 @@ void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1) - : [src_ptr] "r"(src_argb0), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0), + : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0), [mask1] "f"(mask1), [shift] "f"(shift), [value] "f"(value), [width] "r"(width) : "memory"); } -void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, +void ARGBToUVJRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -2637,9 +2637,9 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, __asm__ volatile( "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2655,8 +2655,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 
0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2681,8 +2681,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2698,8 +2698,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2724,8 +2724,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2741,8 +2741,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 
0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2767,8 +2767,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2784,8 +2784,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2822,7 +2822,7 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -2833,7 +2833,7 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), @@ -4386,7 +4386,7 @@ void ARGBShadeRow_MMI(const uint8_t* src_argb, : 
"memory"); } -void ARGBMultiplyRow_MMI(const uint8_t* src_argb0, +void ARGBMultiplyRow_MMI(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -4422,12 +4422,12 @@ void ARGBMultiplyRow_MMI(const uint8_t* src_argb0, [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1), + : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask] "f"(mask) : "memory"); } -void ARGBAddRow_MMI(const uint8_t* src_argb0, +void ARGBAddRow_MMI(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -4449,12 +4449,12 @@ void ARGBAddRow_MMI(const uint8_t* src_argb0, "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1), + : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), [dst_ptr] "r"(dst_argb), [width] "r"(width) : "memory"); } -void ARGBSubtractRow_MMI(const uint8_t* src_argb0, +void ARGBSubtractRow_MMI(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -4476,7 +4476,7 @@ void ARGBSubtractRow_MMI(const uint8_t* src_argb0, "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1), + : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), [dst_ptr] "r"(dst_argb), [width] "r"(width) : "memory"); } @@ -5552,10 +5552,10 @@ void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { : "memory"); } -// Blend src_argb0 over src_argb1 and store to dst_argb. -// dst_argb may be src_argb0 or src_argb1. +// Blend src_argb over src_argb1 and store to dst_argb. +// dst_argb may be src_argb or src_argb1. 
// This code mimics the SSSE3 version for better testability. -void ARGBBlendRow_MMI(const uint8_t* src_argb0, +void ARGBBlendRow_MMI(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -5608,7 +5608,7 @@ void ARGBBlendRow_MMI(const uint8_t* src_argb0, [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo) - : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1), + : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4), [shift] "f"(shift), [width] "r"(width) diff --git a/third_party/libyuv/source/row_msa.cc b/third_party/libyuv/source/row_msa.cc index fe6df93a60..c0b13b0fd0 100644 --- a/third_party/libyuv/source/row_msa.cc +++ b/third_party/libyuv/source/row_msa.cc @@ -781,7 +781,7 @@ void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, } } -void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ARGBToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; v8u16 reg0, reg1, reg2, reg3, reg4, reg5; @@ -792,10 +792,10 @@ void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48); + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48); vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); vec2 = 
(v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); @@ -822,18 +822,18 @@ void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); ST_UB(dst0, dst_y); - src_argb0 += 64; + src_argb += 64; dst_y += 16; } } -void ARGBToUVRow_MSA(const uint8_t* src_argb0, +void ARGBToUVRow_MSA(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* src_argb0_next = src_argb0 + src_stride_argb; + const uint8_t* src_argb_next = src_argb + src_stride_argb; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; @@ -847,14 +847,14 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0, v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48); - src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64); - src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80); - src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96); - src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112); + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48); + src4 = (v16u8)__msa_ld_b((v16u8*)src_argb, 64); + src5 = (v16u8)__msa_ld_b((v16u8*)src_argb, 80); + src6 = (v16u8)__msa_ld_b((v16u8*)src_argb, 96); + src7 = (v16u8)__msa_ld_b((v16u8*)src_argb, 112); vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); @@ -875,14 +875,14 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0, reg3 = 
__msa_hadd_u_h(vec5, vec5); reg4 = __msa_hadd_u_h(vec0, vec0); reg5 = __msa_hadd_u_h(vec1, vec1); - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16); - src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32); - src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48); - src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64); - src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80); - src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96); - src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112); + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 48); + src4 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 64); + src5 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 80); + src6 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 96); + src7 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 112); vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); @@ -945,8 +945,8 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0, dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); ST_UB(dst0, dst_u); ST_UB(dst1, dst_v); - src_argb0 += 128; - src_argb0_next += 128; + src_argb += 128; + src_argb_next += 128; dst_u += 16; dst_v += 16; } @@ -1173,7 +1173,7 @@ void ARGBToUV444Row_MSA(const uint8_t* src_argb, } } -void ARGBMultiplyRow_MSA(const uint8_t* src_argb0, +void ARGBMultiplyRow_MSA(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -1184,7 +1184,7 @@ void ARGBMultiplyRow_MSA(const uint8_t* src_argb0, v8i16 zero = {0}; for (x = 0; x < width; x += 4) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); src1 = (v16u8)__msa_ld_b((void*)src_argb1, 0); vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); vec1 = 
(v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); @@ -1206,13 +1206,13 @@ void ARGBMultiplyRow_MSA(const uint8_t* src_argb0, vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst_argb); - src_argb0 += 16; + src_argb += 16; src_argb1 += 16; dst_argb += 16; } } -void ARGBAddRow_MSA(const uint8_t* src_argb0, +void ARGBAddRow_MSA(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -1220,20 +1220,20 @@ void ARGBAddRow_MSA(const uint8_t* src_argb0, v16u8 src0, src1, src2, src3, dst0, dst1; for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); dst0 = __msa_adds_u_b(src0, src2); dst1 = __msa_adds_u_b(src1, src3); ST_UB2(dst0, dst1, dst_argb, 16); - src_argb0 += 32; + src_argb += 32; src_argb1 += 32; dst_argb += 32; } } -void ARGBSubtractRow_MSA(const uint8_t* src_argb0, +void ARGBSubtractRow_MSA(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -1241,14 +1241,14 @@ void ARGBSubtractRow_MSA(const uint8_t* src_argb0, v16u8 src0, src1, src2, src3, dst0, dst1; for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); dst0 = __msa_subs_u_b(src0, src2); dst1 = __msa_subs_u_b(src1, src3); ST_UB2(dst0, dst1, dst_argb, 16); - src_argb0 += 32; + src_argb += 32; src_argb1 += 32; dst_argb += 32; } @@ -1794,7 +1794,7 @@ void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { } } -void 
RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; v8u16 vec0, vec1, vec2, vec3; @@ -1809,9 +1809,9 @@ void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v16i8 zero = {0}; for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); @@ -1830,12 +1830,12 @@ void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst_y); - src_argb0 += 48; + src_argb += 48; dst_y += 16; } } -void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; v8u16 vec0, vec1, vec2, vec3; @@ -1850,9 +1850,9 @@ void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v16i8 zero = {0}; for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); @@ 
-1871,7 +1871,7 @@ void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst_y); - src_argb0 += 48; + src_argb += 48; dst_y += 16; } } @@ -2037,14 +2037,14 @@ void RGB565ToUVRow_MSA(const uint8_t* src_rgb565, } } -void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, +void RGB24ToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* s = src_rgb0; - const uint8_t* t = src_rgb0 + src_stride_rgb; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; int64_t res0, res1; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 inp0, inp1, inp2, inp3, inp4, inp5; @@ -2147,14 +2147,14 @@ void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, } } -void RAWToUVRow_MSA(const uint8_t* src_rgb0, +void RAWToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* s = src_rgb0; - const uint8_t* t = src_rgb0 + src_stride_rgb; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; int64_t res0, res1; v16u8 inp0, inp1, inp2, inp3, inp4, inp5; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; @@ -2446,7 +2446,7 @@ void SobelXYRow_MSA(const uint8_t* src_sobelx, } } -void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ARGBToYJRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D); @@ -2454,19 +2454,19 @@ void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v8u16 const_0x80 = (v8u16)__msa_fill_h(0x80); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48); + src0 = 
(v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); ARGBTOY(src0, src1, src2, src3, const_0x961D, const_0x4D, const_0x80, 8, dst0); ST_UB(dst0, dst_y); - src_argb0 += 64; + src_argb += 64; dst_y += 16; } } -void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200); @@ -2474,19 +2474,19 @@ void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8, dst0); ST_UB(dst0, dst_y); - src_argb0 += 64; + src_argb += 64; dst_y += 16; } } -void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142); @@ -2494,19 +2494,19 @@ void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 
= (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8, dst0); ST_UB(dst0, dst_y); - src_argb0 += 64; + src_argb += 64; dst_y += 16; } } -void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900); @@ -2514,26 +2514,26 @@ void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8, dst0); ST_UB(dst0, dst_y); - src_argb0 += 64; + src_argb += 64; dst_y += 16; } } -void ARGBToUVJRow_MSA(const uint8_t* src_rgb0, +void ARGBToUVJRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* s = src_rgb0; - const uint8_t* t = src_rgb0 + src_stride_rgb; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; v8u16 src0, src1, src2, src3, src4, src5, src6, src7; v8u16 vec0, vec1, vec2, vec3; v8u16 dst0, dst1, dst2, dst3; @@ -2658,14 +2658,14 @@ void ARGBToUVJRow_MSA(const uint8_t* src_rgb0, } } -void BGRAToUVRow_MSA(const uint8_t* src_rgb0, +void BGRAToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* s = src_rgb0; - const uint8_t* 
t = src_rgb0 + src_stride_rgb; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; const uint8_t unused = 0xf; v8u16 src0, src1, src2, src3; v16u8 dst0, dst1; @@ -2693,14 +2693,14 @@ void BGRAToUVRow_MSA(const uint8_t* src_rgb0, } } -void ABGRToUVRow_MSA(const uint8_t* src_rgb0, +void ABGRToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* s = src_rgb0; - const uint8_t* t = src_rgb0 + src_stride_rgb; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; const uint8_t unused = 0xf; v8u16 src0, src1, src2, src3; v16u8 dst0, dst1; @@ -2728,14 +2728,14 @@ void ABGRToUVRow_MSA(const uint8_t* src_rgb0, } } -void RGBAToUVRow_MSA(const uint8_t* src_rgb0, +void RGBAToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* s = src_rgb0; - const uint8_t* t = src_rgb0 + src_stride_rgb; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; const uint8_t unused = 0xf; v8u16 src0, src1, src2, src3; v16u8 dst0, dst1; @@ -3109,7 +3109,7 @@ void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, } } -void ARGBBlendRow_MSA(const uint8_t* src_argb0, +void ARGBBlendRow_MSA(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -3123,8 +3123,8 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0, v16i8 zero = {0}; for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); @@ -3168,7 +3168,7 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0, dst0 = __msa_bmnz_v(dst0, const_255, mask); dst1 = __msa_bmnz_v(dst1, const_255, mask); ST_UB2(dst0, dst1, 
dst_argb, 16); - src_argb0 += 32; + src_argb += 32; src_argb1 += 32; dst_argb += 32; } diff --git a/third_party/libyuv/source/row_neon.cc b/third_party/libyuv/source/row_neon.cc index 43a2cac752..6ef6f1c463 100644 --- a/third_party/libyuv/source/row_neon.cc +++ b/third_party/libyuv/source/row_neon.cc @@ -21,90 +21,115 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ !defined(__aarch64__) +// q0: Y uint16x8_t +// d2: U uint8x8_t +// d3: V uint8x8_t + // Read 8 Y, 4 U and 4 V from 422 #define READYUV422 \ - "vld1.8 {d0}, [%0]! \n" \ - "vld1.32 {d2[0]}, [%1]! \n" \ - "vld1.32 {d2[1]}, [%2]! \n" + "vld1.8 {d0}, [%[src_y]]! \n" \ + "vld1.32 {d2[0]}, [%[src_u]]! \n" \ + "vld1.32 {d2[1]}, [%[src_v]]! \n" \ + "vmov.u8 d1, d0 \n" \ + "vmovl.u8 q1, d2 \n" \ + "vzip.u8 d0, d1 \n" \ + "vsli.u16 q1, q1, #8 \n" // Read 8 Y, 8 U and 8 V from 444 #define READYUV444 \ - "vld1.8 {d0}, [%0]! \n" \ - "vld1.8 {d2}, [%1]! \n" \ - "vld1.8 {d3}, [%2]! \n" \ - "vpaddl.u8 q1, q1 \n" \ - "vrshrn.u16 d2, q1, #1 \n" + "vld1.8 {d0}, [%[src_y]]! \n" \ + "vld1.8 {d2}, [%[src_u]]! \n" \ + "vmovl.u8 q0, d0 \n" \ + "vld1.8 {d3}, [%[src_v]]! \n" \ + "vsli.u16 q0, q0, #8 \n" // Read 8 Y, and set 4 U and 4 V to 128 #define READYUV400 \ - "vld1.8 {d0}, [%0]! \n" \ - "vmov.u8 d2, #128 \n" + "vld1.8 {d0}, [%[src_y]]! \n" \ + "vmov.u8 q1, #128 \n" \ + "vmovl.u8 q0, d0 \n" \ + "vsli.u16 q0, q0, #8 \n" // Read 8 Y and 4 UV from NV12 -#define READNV12 \ - "vld1.8 {d0}, [%0]! \n" \ - "vld1.8 {d2}, [%1]! \n" \ - "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \ - "vuzp.u8 d2, d3 \n" \ - "vtrn.u32 d2, d3 \n" +#define READNV12 \ + "vld1.8 {d0}, [%[src_y]]! \n" \ + "vld1.8 {d2}, [%[src_uv]]! \n" \ + "vmov.u8 d1, d0 \n" \ + "vmov.u8 d3, d2 \n" \ + "vzip.u8 d0, d1 \n" \ + "vsli.u16 d2, d2, #8 \n" /* Duplicate low byte (U) */ \ + "vsri.u16 d3, d3, #8 \n" /* Duplicate high byte (V) */ // Read 8 Y and 4 VU from NV21 #define READNV21 \ - "vld1.8 {d0}, [%0]! \n" \ - "vld1.8 {d2}, [%1]! 
\n" \ - "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \ - "vuzp.u8 d3, d2 \n" \ - "vtrn.u32 d2, d3 \n" + "vld1.8 {d0}, [%[src_y]]! \n" \ + "vld1.8 {d2}, [%[src_vu]]! \n" \ + "vmov.u8 d1, d0 \n" \ + "vmov.u8 d3, d2 \n" \ + "vzip.u8 d0, d1 \n" \ + "vsri.u16 d2, d2, #8 \n" /* Duplicate high byte (U) */ \ + "vsli.u16 d3, d3, #8 \n" /* Duplicate low byte (V) */ // Read 8 YUY2 #define READYUY2 \ - "vld2.8 {d0, d2}, [%0]! \n" \ + "vld2.8 {d0, d2}, [%[src_yuy2]]! \n" \ + "vmovl.u8 q0, d0 \n" \ "vmov.u8 d3, d2 \n" \ - "vuzp.u8 d2, d3 \n" \ - "vtrn.u32 d2, d3 \n" + "vsli.u16 q0, q0, #8 \n" \ + "vsli.u16 d2, d2, #8 \n" \ + "vsri.u16 d3, d3, #8 \n" // Read 8 UYVY #define READUYVY \ - "vld2.8 {d2, d3}, [%0]! \n" \ - "vmov.u8 d0, d3 \n" \ + "vld2.8 {d2, d3}, [%[src_uyvy]]! \n" \ + "vmovl.u8 q0, d3 \n" \ "vmov.u8 d3, d2 \n" \ - "vuzp.u8 d2, d3 \n" \ - "vtrn.u32 d2, d3 \n" - -#define YUVTORGB_SETUP \ - "vld1.8 {d24}, [%[kUVToRB]] \n" \ - "vld1.8 {d25}, [%[kUVToG]] \n" \ - "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \ - "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! 
\n" \ - "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \ - "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n" - -#define YUVTORGB \ - "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */ \ - "vmull.u8 q9, d2, d25 \n" /* u/v G component */ \ - "vmovl.u8 q0, d0 \n" /* Y */ \ - "vmovl.s16 q10, d1 \n" \ - "vmovl.s16 q0, d0 \n" \ - "vmul.s32 q10, q10, q15 \n" \ - "vmul.s32 q0, q0, q15 \n" \ - "vqshrun.s32 d0, q0, #16 \n" \ - "vqshrun.s32 d1, q10, #16 \n" /* Y */ \ - "vadd.s16 d18, d19 \n" \ - "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */ \ - "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */ \ - "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/ \ - "vaddw.u16 q1, q1, d16 \n" \ - "vaddw.u16 q10, q10, d17 \n" \ - "vaddw.u16 q3, q3, d18 \n" \ - "vqadd.s16 q8, q0, q13 \n" /* B */ \ - "vqadd.s16 q9, q0, q14 \n" /* R */ \ - "vqadd.s16 q0, q0, q4 \n" /* G */ \ - "vqadd.s16 q8, q8, q1 \n" /* B */ \ - "vqadd.s16 q9, q9, q10 \n" /* R */ \ - "vqsub.s16 q0, q0, q3 \n" /* G */ \ - "vqshrun.s16 d20, q8, #6 \n" /* B */ \ - "vqshrun.s16 d22, q9, #6 \n" /* R */ \ - "vqshrun.s16 d21, q0, #6 \n" /* G */ + "vsli.u16 q0, q0, #8 \n" \ + "vsli.u16 d2, d2, #8 \n" \ + "vsri.u16 d3, d3, #8 \n" + +#define YUVTORGB_SETUP \ + "vld4.8 {d26[], d27[], d28[], d29[]}, [%[kUVCoeff]] \n" \ + "vld1.16 {d31[]}, [%[kRGBCoeffBias]]! \n" \ + "vld1.16 {d20[], d21[]}, [%[kRGBCoeffBias]]! \n" \ + "vld1.16 {d22[], d23[]}, [%[kRGBCoeffBias]]! 
\n" \ + "vld1.16 {d24[], d25[]}, [%[kRGBCoeffBias]] \n" + +// q0: B uint16x8_t +// q1: G uint16x8_t +// q2: R uint16x8_t + +// Convert from YUV to 2.14 fixed point RGB +#define YUVTORGB \ + "vmull.u16 q2, d1, d31 \n" \ + "vmull.u8 q8, d3, d29 \n" /* DGV */ \ + "vmull.u16 q0, d0, d31 \n" \ + "vmlal.u8 q8, d2, d28 \n" /* DG */ \ + "vqshrn.u32 d0, q0, #16 \n" \ + "vqshrn.u32 d1, q2, #16 \n" /* Y */ \ + "vmull.u8 q9, d2, d26 \n" /* DB */ \ + "vmull.u8 q2, d3, d27 \n" /* DR */ \ + "vadd.u16 q4, q0, q11 \n" /* G */ \ + "vadd.u16 q2, q0, q2 \n" /* R */ \ + "vadd.u16 q0, q0, q9 \n" /* B */ \ + "vqsub.u16 q1, q4, q8 \n" /* G */ \ + "vqsub.u16 q0, q0, q10 \n" /* B */ \ + "vqsub.u16 q2, q2, q12 \n" /* R */ + +// Convert from 2.14 fixed point RGB To 8 bit RGB +#define RGBTORGB8 \ + "vqshrn.u16 d4, q2, #6 \n" /* R */ \ + "vqshrn.u16 d2, q1, #6 \n" /* G */ \ + "vqshrn.u16 d0, q0, #6 \n" /* B */ + +#define YUVTORGB_REGS \ + "q0", "q1", "q2", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "d31" + +#define STORERGBA \ + "vmov.u8 d1, d0 \n" \ + "vmov.u8 d3, d4 \n" \ + "vmov.u8 d0, d6 \n" \ + "vst4.8 {d0, d1, d2, d3}, [%[dst_rgba]]! \n" void I444ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, @@ -114,22 +139,20 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" + "vmov.u8 d6, #255 \n" "1: \n" READYUV444 YUVTORGB - "subs %4, %4, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void I422ToARGBRow_NEON(const uint8_t* src_y, @@ -140,22 +163,20 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" + "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void I444AlphaToARGBRow_NEON(const uint8_t* src_y, @@ -168,22 +189,20 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "1: \n" READYUV444 YUVTORGB - "vld1.8 {d23}, [%3]! \n" - "subs %5, %5, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%4]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + RGBTORGB8 + "vld1.8 {d6}, [%[src_a]]! \n" + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [src_a] "+r"(src_a), // %[src_a] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void I422AlphaToARGBRow_NEON(const uint8_t* src_y, @@ -196,22 +215,20 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB - "subs %5, %5, #8 \n" - "vld1.8 {d23}, [%3]! \n" - "vst4.8 {d20, d21, d22, d23}, [%4]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + RGBTORGB8 + "vld1.8 {d6}, [%[src_a]]! \n" + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [src_a] "+r"(src_a), // %[src_a] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void I422ToRGBARow_NEON(const uint8_t* src_y, @@ -222,22 +239,18 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d19, #255 \n" // YUVTORGB modified d19 - "vst4.8 {d19, d20, d21, d22}, [%3]! 
\n" + RGBTORGB8 "subs %[width], %[width], #8 \n" STORERGBA "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgba), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgba] "+r"(dst_rgba), // %[dst_rgba] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void I422ToRGB24Row_NEON(const uint8_t* src_y, @@ -248,29 +261,28 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vst3.8 {d20, d21, d22}, [%3]! \n" + RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! 
\n" "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb24), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); } #define ARGBTORGB565 \ - "vshll.u8 q0, d22, #8 \n" /* R */ \ - "vshll.u8 q8, d21, #8 \n" /* G */ \ - "vshll.u8 q9, d20, #8 \n" /* B */ \ - "vsri.16 q0, q8, #5 \n" /* RG */ \ - "vsri.16 q0, q9, #11 \n" /* RGB */ + "vshll.u8 q2, d4, #8 \n" /* R */ \ + "vshll.u8 q1, d2, #8 \n" /* G */ \ + "vshll.u8 q0, d0, #8 \n" /* B */ \ + "vsri.16 q2, q1, #5 \n" /* RG */ \ + "vsri.16 q2, q0, #11 \n" /* RGB */ void I422ToRGB565Row_NEON(const uint8_t* src_y, const uint8_t* src_u, @@ -280,31 +292,29 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" ARGBTORGB565 - "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb565), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + RGBTORGB8 "subs %[width], %[width], #8 \n" ARGBTORGB565 + "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565. 
+ "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); } #define ARGBTOARGB1555 \ - "vshll.u8 q0, d23, #8 \n" /* A */ \ - "vshll.u8 q8, d22, #8 \n" /* R */ \ - "vshll.u8 q9, d21, #8 \n" /* G */ \ - "vshll.u8 q10, d20, #8 \n" /* B */ \ - "vsri.16 q0, q8, #1 \n" /* AR */ \ - "vsri.16 q0, q9, #6 \n" /* ARG */ \ - "vsri.16 q0, q10, #11 \n" /* ARGB */ + "vshll.u8 q3, d6, #8 \n" /* A */ \ + "vshll.u8 q2, d4, #8 \n" /* R */ \ + "vshll.u8 q1, d2, #8 \n" /* G */ \ + "vshll.u8 q0, d0, #8 \n" /* B */ \ + "vsri.16 q3, q2, #1 \n" /* AR */ \ + "vsri.16 q3, q1, #6 \n" /* ARG */ \ + "vsri.16 q3, q0, #11 \n" /* ARGB */ void I422ToARGB1555Row_NEON(const uint8_t* src_y, const uint8_t* src_u, @@ -315,30 +325,28 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" ARGBTOARGB1555 - "vst1.8 {q0}, [%3]! \n" // store 8 pixels - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb1555), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vmov.u8 d6, #0xff \n" ARGBTOARGB1555 + "vst1.8 {q3}, [%[dst_argb1555]]! \n" // store 8 pixels RGB1555. 
+ "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb1555] "+r"(dst_argb1555), // %[dst_argb1555] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "q3"); } #define ARGBTOARGB4444 \ - "vshr.u8 d20, d20, #4 \n" /* B */ \ - "vbic.32 d21, d21, d4 \n" /* G */ \ - "vshr.u8 d22, d22, #4 \n" /* R */ \ - "vbic.32 d23, d23, d4 \n" /* A */ \ - "vorr d0, d20, d21 \n" /* BG */ \ - "vorr d1, d22, d23 \n" /* RA */ \ + "vshr.u8 d0, d0, #4 \n" /* B */ \ + "vbic.32 d2, d2, d7 \n" /* G */ \ + "vshr.u8 d4, d4, #4 \n" /* R */ \ + "vbic.32 d6, d6, d7 \n" /* A */ \ + "vorr d0, d0, d2 \n" /* BG */ \ + "vorr d1, d4, d6 \n" /* RA */ \ "vzip.u8 d0, d1 \n" /* BGRA */ void I422ToARGB4444Row_NEON(const uint8_t* src_y, @@ -349,25 +357,21 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "vmov.u8 d4, #0x0f \n" // vbic bits to clear - "1: \n" - - READYUV422 YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" ARGBTOARGB4444 - "vst1.8 {q0}, [%3]! \n" // store 8 pixels - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb4444), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + "vmov.u8 d6, #255 \n" + "vmov.u8 d7, #0x0f \n" // vbic bits to clear + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %[width], %[width], #8 \n" ARGBTOARGB4444 + "vst1.8 {q0}, [%[dst_argb4444]]! 
\n" // store 8 pixels + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb4444] "+r"(dst_argb4444), // %[dst_argb4444] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "q3"); } void I400ToARGBRow_NEON(const uint8_t* src_y, @@ -376,20 +380,18 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" + "vmov.u8 d6, #255 \n" "1: \n" READYUV400 YUVTORGB - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + : [src_y] "+r"(src_y), // %[src_y] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { @@ -414,22 +416,20 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READNV12 YUVTORGB - "subs %3, %3, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%2]! 
\n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15"); + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void NV21ToARGBRow_NEON(const uint8_t* src_y, @@ -437,22 +437,20 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READNV21 YUVTORGB - "subs %3, %3, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15"); + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READNV21 YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" + "bgt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_vu] "+r"(src_vu), // %[src_vu] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void NV12ToRGB24Row_NEON(const uint8_t* src_y, @@ -461,25 +459,19 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile( - YUVTORGB_SETUP - - "1: \n" - - READNV12 YUVTORGB - "subs %3, %3, #8 \n" - "vst3.8 {d20, d21, d22}, [%2]! \n" + "vmov.u8 d6, #255 \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_rgb24), // %2 - "+r"(width) // %3 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); } void NV21ToRGB24Row_NEON(const uint8_t* src_y, @@ -488,25 +480,19 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile( - YUVTORGB_SETUP - - "1: \n" - - READNV21 YUVTORGB - "subs %3, %3, #8 \n" - "vst3.8 {d20, d21, d22}, [%2]! \n" + "vmov.u8 d6, #255 \n" + "1: \n" READNV21 YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! 
\n" "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_rgb24), // %2 - "+r"(width) // %3 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + : [src_y] "+r"(src_y), // %[src_y] + [src_vu] "+r"(src_vu), // %[src_vu] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); } void NV12ToRGB565Row_NEON(const uint8_t* src_y, @@ -516,62 +502,56 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READNV12 YUVTORGB - "subs %3, %3, #8 \n" ARGBTORGB565 - "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. + "vmov.u8 d6, #255 \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" ARGBTORGB565 + "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565. 
"bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_rgb565), // %2 - "+r"(width) // %3 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); } void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READYUY2 YUVTORGB - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READYUY2 YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15"); + : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READUYVY YUVTORGB - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d6, #255 \n" + "1: \n" READUYVY YUVTORGB RGBTORGB8 + "subs %[width], %[width], #8 \n" + "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15"); + : [src_uyvy] "+r"(src_uyvy), // %[src_uyvy] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "d6"); } // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. @@ -760,8 +740,8 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, "vld1.8 {q1}, [%1]! \n" // load G "vld1.8 {q0}, [%2]! 
\n" // load B "subs %4, %4, #16 \n" // 16 processed per loop - "vst4.8 {d0, d2, d4, d6}, [%4]! \n" // store 8 ARGB - "vst4.8 {d1, d3, d5, d7}, [%4]! \n" // next 8 ARGB + "vst4.8 {d0, d2, d4, d6}, [%3]! \n" // store 8 ARGB + "vst4.8 {d1, d3, d5, d7}, [%3]! \n" // next 8 ARGB "bgt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 @@ -773,6 +753,226 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, ); } +void MergeXR30Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width) { + int shift = 10 - depth; + asm volatile( + "vmov.u32 q14, #1023 \n" + "vdup.32 q15, %5 \n" + "1: \n" + "vld1.16 {d4}, [%2]! \n" // B + "vld1.16 {d2}, [%1]! \n" // G + "vld1.16 {d0}, [%0]! \n" // R + "vmovl.u16 q2, d4 \n" // B + "vmovl.u16 q1, d2 \n" // G + "vmovl.u16 q0, d0 \n" // R + "vshl.u32 q2, q2, q15 \n" // 000B + "vshl.u32 q1, q1, q15 \n" + "vshl.u32 q0, q0, q15 \n" + "vmin.u32 q2, q2, q14 \n" + "vmin.u32 q1, q1, q14 \n" + "vmin.u32 q0, q0, q14 \n" + "vsli.u32 q2, q1, #10 \n" // 00GB + "vsli.u32 q2, q0, #20 \n" // 0RGB + "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30) + "subs %4, %4, #4 \n" + "vst1.8 {q2}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 + : "r"(shift) // %5 + : "memory", "cc", "q0", "q1", "q2", "q14", "q15"); +} + +void MergeXR30Row_10_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int /* depth */, + int width) { + asm volatile( + "vmov.u32 q14, #1023 \n" + "1: \n" + "vld1.16 {d4}, [%2]! \n" // B + "vld1.16 {d2}, [%1]! \n" // G + "vld1.16 {d0}, [%0]! 
\n" // R + "vmovl.u16 q2, d4 \n" // 000B + "vmovl.u16 q1, d2 \n" // G + "vmovl.u16 q0, d0 \n" // R + "vmin.u32 q2, q2, q14 \n" + "vmin.u32 q1, q1, q14 \n" + "vmin.u32 q0, q0, q14 \n" + "vsli.u32 q2, q1, #10 \n" // 00GB + "vsli.u32 q2, q0, #20 \n" // 0RGB + "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30) + "subs %4, %4, #4 \n" + "vst1.8 {q2}, [%3]! \n" + "bgt 1b \n" + "3: \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 + : + : "memory", "cc", "q0", "q1", "q2", "q14"); +} + +void MergeAR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + asm volatile( + + "vdup.u16 q15, %6 \n" + "vdup.u16 q14, %7 \n" + "1: \n" + "vld1.16 {q2}, [%0]! \n" // R + "vld1.16 {q1}, [%1]! \n" // G + "vld1.16 {q0}, [%2]! \n" // B + "vld1.16 {q3}, [%3]! \n" // A + "vmin.u16 q2, q2, q14 \n" + "vmin.u16 q1, q1, q14 \n" + "vmin.u16 q0, q0, q14 \n" + "vmin.u16 q3, q3, q14 \n" + "vshl.u16 q2, q2, q15 \n" + "vshl.u16 q1, q1, q15 \n" + "vshl.u16 q0, q0, q15 \n" + "vshl.u16 q3, q3, q15 \n" + "subs %5, %5, #8 \n" + "vst4.16 {d0, d2, d4, d6}, [%4]! \n" + "vst4.16 {d1, d3, d5, d7}, [%4]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_ar64), // %4 + "+r"(width) // %5 + : "r"(shift), // %6 + "r"(mask) // %7 + : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); +} + +void MergeXR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + asm volatile( + + "vmov.u8 q3, #0xff \n" // A (0xffff) + "vdup.u16 q15, %5 \n" + "vdup.u16 q14, %6 \n" + "1: \n" + "vld1.16 {q2}, [%0]! \n" // R + "vld1.16 {q1}, [%1]! \n" // G + "vld1.16 {q0}, [%2]! 
\n" // B + "vmin.u16 q2, q2, q14 \n" + "vmin.u16 q1, q1, q14 \n" + "vmin.u16 q0, q0, q14 \n" + "vshl.u16 q2, q2, q15 \n" + "vshl.u16 q1, q1, q15 \n" + "vshl.u16 q0, q0, q15 \n" + "subs %4, %4, #8 \n" + "vst4.16 {d0, d2, d4, d6}, [%3]! \n" + "vst4.16 {d1, d3, d5, d7}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar64), // %3 + "+r"(width) // %4 + : "r"(shift), // %5 + "r"(mask) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); +} + +void MergeARGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width) { + int shift = 8 - depth; + asm volatile( + + "vdup.16 q15, %6 \n" + "1: \n" + "vld1.16 {q2}, [%0]! \n" // R + "vld1.16 {q1}, [%1]! \n" // G + "vld1.16 {q0}, [%2]! \n" // B + "vld1.16 {q3}, [%3]! \n" // A + "vshl.u16 q2, q2, q15 \n" + "vshl.u16 q1, q1, q15 \n" + "vshl.u16 q0, q0, q15 \n" + "vshl.u16 q3, q3, q15 \n" + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d1, q1 \n" + "vqmovn.u16 d2, q2 \n" + "vqmovn.u16 d3, q3 \n" + "subs %5, %5, #8 \n" + "vst4.8 {d0, d1, d2, d3}, [%4]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : "r"(shift) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); +} + +void MergeXRGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width) { + int shift = 8 - depth; + asm volatile( + + "vdup.16 q15, %5 \n" + "vmov.u8 d6, #0xff \n" // A (0xff) + "1: \n" + "vld1.16 {q2}, [%0]! \n" // R + "vld1.16 {q1}, [%1]! \n" // G + "vld1.16 {q0}, [%2]! \n" // B + "vshl.u16 q2, q2, q15 \n" + "vshl.u16 q1, q1, q15 \n" + "vshl.u16 q0, q0, q15 \n" + "vqmovn.u16 d5, q2 \n" + "vqmovn.u16 d4, q1 \n" + "vqmovn.u16 d3, q0 \n" + "subs %4, %4, #8 \n" + "vst4.u8 {d3, d4, d5, d6}, [%3]! 
\n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(shift) // %5 + : "memory", "cc", "q0", "q1", "q2", "d6", "q15"); +} + // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( @@ -1328,16 +1528,16 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTORGB565 - "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. + "vst1.8 {q2}, [%1]! \n" // store 8 pixels RGB565. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb565), // %1 "+r"(width) // %2 : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); + : "cc", "memory", "q0", "q1", "q2", "d6"); } void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, @@ -1345,21 +1545,21 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, const uint32_t dither4, int width) { asm volatile( - "vdup.32 d2, %2 \n" // dither4 + "vdup.32 d7, %2 \n" // dither4 "1: \n" - "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB. + "vld4.8 {d0, d2, d4, d6}, [%1]! \n" // load 8 pixels of ARGB. "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d20, d20, d2 \n" - "vqadd.u8 d21, d21, d2 \n" - "vqadd.u8 d22, d22, d2 \n" // add for dither + "vqadd.u8 d0, d0, d7 \n" + "vqadd.u8 d2, d2, d7 \n" + "vqadd.u8 d4, d4, d7 \n" // add for dither ARGBTORGB565 - "vst1.8 {q0}, [%0]! \n" // store 8 RGB565. + "vst1.8 {q2}, [%0]! \n" // store 8 RGB565. 
"bgt 1b \n" : "+r"(dst_rgb) // %0 : "r"(src_argb), // %1 "r"(dither4), // %2 "r"(width) // %3 - : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"); + : "cc", "memory", "q0", "q1", "q2", "q3"); } void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, @@ -1367,26 +1567,26 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB1555 - "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555. + "vst1.8 {q3}, [%1]! \n" // store 8 ARGB1555. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb1555), // %1 "+r"(width) // %2 : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); + : "cc", "memory", "q0", "q1", "q2", "q3"); } void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb4444, int width) { asm volatile( - "vmov.u8 d4, #0x0f \n" // bits to clear with + "vmov.u8 d7, #0x0f \n" // bits to clear with // vbic. "1: \n" - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB4444 "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444. 
@@ -1395,7 +1595,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, "+r"(dst_argb4444), // %1 "+r"(width) // %2 : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); + : "cc", "memory", "q0", "q1", "q2", "q3"); } void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { @@ -1460,7 +1660,7 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); } -void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { +void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient @@ -1474,7 +1674,7 @@ void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" - : "+r"(src_argb), // %0 + : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : @@ -2119,6 +2319,105 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); } +static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, + 10, 9, 8, 11, 14, 13, 12, 15}; + +void ARGBToAR64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" + "vld1.8 {q2}, [%0]! \n" + "vmov.u8 q1, q0 \n" + "vmov.u8 q3, q2 \n" + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels + "vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ar64), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +void ARGBToAB64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ab64, + int width) { + asm volatile( + "vld1.8 q4, %3 \n" // shuffler + "1: \n" + "vld1.8 {q0}, [%0]! \n" + "vld1.8 {q2}, [%0]! 
\n" + "vtbl.8 d2, {d0, d1}, d8 \n" + "vtbl.8 d3, {d0, d1}, d9 \n" + "vtbl.8 d6, {d4, d5}, d8 \n" + "vtbl.8 d7, {d4, d5}, d9 \n" + "vmov.u8 q0, q1 \n" + "vmov.u8 q2, q3 \n" + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels + "vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToABGR) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); +} + +void AR64ToARGBRow_NEON(const uint16_t* src_ar64, + uint8_t* dst_argb, + int width) { + asm volatile( + "1: \n" + "vld1.16 {q0}, [%0]! \n" + "vld1.16 {q1}, [%0]! \n" + "vld1.16 {q2}, [%0]! \n" + "vld1.16 {q3}, [%0]! \n" + "vshrn.u16 d0, q0, #8 \n" + "vshrn.u16 d1, q1, #8 \n" + "vshrn.u16 d4, q2, #8 \n" + "vshrn.u16 d5, q3, #8 \n" + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst1.8 {q0}, [%1]! \n" // store 4 pixels + "vst1.8 {q2}, [%1]! \n" // store 4 pixels + "bgt 1b \n" + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15}; + +void AB64ToARGBRow_NEON(const uint16_t* src_ab64, + uint8_t* dst_argb, + int width) { + asm volatile( + "vld1.8 d8, %3 \n" // shuffler + "1: \n" + "vld1.16 {q0}, [%0]! \n" + "vld1.16 {q1}, [%0]! \n" + "vld1.16 {q2}, [%0]! \n" + "vld1.16 {q3}, [%0]! \n" + "vtbl.8 d0, {d0, d1}, d8 \n" + "vtbl.8 d1, {d2, d3}, d8 \n" + "vtbl.8 d4, {d4, d5}, d8 \n" + "vtbl.8 d5, {d6, d7}, d8 \n" + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst1.8 {q0}, [%1]! \n" // store 4 pixels + "vst1.8 {q2}, [%1]! 
\n" // store 4 pixels + "bgt 1b \n" + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAB64ToARGB) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); +} + void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { asm volatile( "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient @@ -2263,9 +2562,9 @@ void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { "1: \n" "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q4, d0, d4 \n" // B + "vmull.u8 q4, d0, d4 \n" // R "vmlal.u8 q4, d1, d5 \n" // G - "vmlal.u8 q4, d2, d6 \n" // R + "vmlal.u8 q4, d2, d6 \n" // B "vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" @@ -2336,7 +2635,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, } // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr -void ARGBBlendRow_NEON(const uint8_t* src_argb0, +void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -2387,7 +2686,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0, "99: \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2625,7 +2924,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, } // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, +void ARGBMultiplyRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -2645,7 +2944,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 
"bgt 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2654,7 +2953,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, } // Add 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBAddRow_NEON(const uint8_t* src_argb0, +void ARGBAddRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -2668,7 +2967,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0, "vqadd.u8 q1, q1, q3 \n" // add R, A "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2677,7 +2976,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0, } // Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_NEON(const uint8_t* src_argb0, +void ARGBSubtractRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -2691,7 +2990,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0, "vqsub.u8 q1, q1, q3 \n" // subtract R, A "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -3171,32 +3470,22 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv, uint16_t* dst_v, int depth, int width) { + int shift = depth - 16; // Negative for right shift. asm volatile( - "vdup.32 q0, %3 \n" + "vdup.16 q2, %4 \n" "1: \n" - "vld2.16 {q1, q2}, [%0]! \n" // load 8 UV - "vmovl.u16 q3, d2 \n" - "vmovl.u16 q4, d3 \n" - "vshl.u32 q3, q3, q0 \n" - "vshl.u32 q4, q4, q0 \n" - "vmovn.u32 d2, q3 \n" - "vmovn.u32 d3, q4 \n" - "vmovl.u16 q3, d4 \n" - "vmovl.u16 q4, d5 \n" - "vshl.u32 q3, q3, q0 \n" - "vshl.u32 q4, q4, q0 \n" - "vmovn.u32 d4, q3 \n" - "vmovn.u32 d5, q4 \n" - "subs %4, %4, #8 \n" // 8 src pixels per loop - "vst1.16 {q1}, [%1]! \n" // store 8 U pixels - "vst1.16 {q2}, [%2]! 
\n" // store 8 V pixels + "vld2.16 {q0, q1}, [%0]! \n" // load 8 UV + "vshl.u16 q0, q0, q2 \n" + "vshl.u16 q1, q1, q2 \n" + "subs %3, %3, #8 \n" // 8 src pixels per loop + "vst1.16 {q0}, [%1]! \n" // store 8 U pixels + "vst1.16 {q1}, [%2]! \n" // store 8 V pixels "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(depth), // %3 - "+r"(width) // %4 - : + "+r"(width) // %3 + : "r"(shift) // %4 : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); } @@ -3207,21 +3496,20 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, int width) { int shift = 16 - depth; asm volatile( - "vdup.16 q2, %3 \n" + "vdup.16 q2, %4 \n" "1: \n" "vld1.16 {q0}, [%0]! \n" // load 8 U "vld1.16 {q1}, [%1]! \n" // load 8 V "vshl.u16 q0, q0, q2 \n" "vshl.u16 q1, q1, q2 \n" - "subs %4, %4, #8 \n" // 8 src pixels per loop + "subs %3, %3, #8 \n" // 8 src pixels per loop "vst2.16 {q0, q1}, [%2]! \n" // store 8 UV pixels "bgt 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 - "+r"(shift), // %3 - "+r"(width) // %4 - : + "+r"(width) // %3 + : "r"(shift) // %4 : "cc", "memory", "q0", "q1", "q2"); } diff --git a/third_party/libyuv/source/row_neon64.cc b/third_party/libyuv/source/row_neon64.cc index 941c9b9805..da7e3c7cd4 100644 --- a/third_party/libyuv/source/row_neon64.cc +++ b/third_party/libyuv/source/row_neon64.cc @@ -18,93 +18,101 @@ extern "C" { // This module is for GCC Neon armv8 64 bit. 
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +// v0.8h: Y +// v1.16b: 8U, 8V + // Read 8 Y, 4 U and 4 V from 422 #define READYUV422 \ - "ld1 {v0.8b}, [%0], #8 \n" \ - "ld1 {v1.s}[0], [%1], #4 \n" \ - "ld1 {v1.s}[1], [%2], #4 \n" + "ldr d0, [%[src_y]], #8 \n" \ + "ld1 {v1.s}[0], [%[src_u]], #4 \n" \ + "ld1 {v1.s}[1], [%[src_v]], #4 \n" \ + "zip1 v0.16b, v0.16b, v0.16b \n" \ + "prfm pldl1keep, [%[src_y], 448] \n" \ + "zip1 v1.16b, v1.16b, v1.16b \n" \ + "prfm pldl1keep, [%[src_u], 128] \n" \ + "prfm pldl1keep, [%[src_v], 128] \n" // Read 8 Y, 8 U and 8 V from 444 #define READYUV444 \ - "ld1 {v0.8b}, [%0], #8 \n" \ - "ld1 {v1.d}[0], [%1], #8 \n" \ - "ld1 {v1.d}[1], [%2], #8 \n" \ - "uaddlp v1.8h, v1.16b \n" \ - "rshrn v1.8b, v1.8h, #1 \n" + "ldr d0, [%[src_y]], #8 \n" \ + "ld1 {v1.d}[0], [%[src_u]], #8 \n" \ + "prfm pldl1keep, [%[src_y], 448] \n" \ + "ld1 {v1.d}[1], [%[src_v]], #8 \n" \ + "prfm pldl1keep, [%[src_u], 448] \n" \ + "zip1 v0.16b, v0.16b, v0.16b \n" \ + "prfm pldl1keep, [%[src_v], 448] \n" // Read 8 Y, and set 4 U and 4 V to 128 #define READYUV400 \ - "ld1 {v0.8b}, [%0], #8 \n" \ - "movi v1.8b , #128 \n" + "ldr d0, [%[src_y]], #8 \n" \ + "movi v1.16b, #128 \n" \ + "prfm pldl1keep, [%[src_y], 448] \n" \ + "zip1 v0.16b, v0.16b, v0.16b \n" + +static const uvec8 kNV12Table = {0, 0, 2, 2, 4, 4, 6, 6, + 1, 1, 3, 3, 5, 5, 7, 7}; +static const uvec8 kNV21Table = {1, 1, 3, 3, 5, 5, 7, 7, + 0, 0, 2, 2, 4, 4, 6, 6}; -// Read 8 Y and 4 UV from NV12 +// Read 8 Y and 4 UV from NV12 or NV21 #define READNV12 \ - "ld1 {v0.8b}, [%0], #8 \n" \ - "ld1 {v2.8b}, [%1], #8 \n" \ - "uzp1 v1.8b, v2.8b, v2.8b \n" \ - "uzp2 v3.8b, v2.8b, v2.8b \n" \ - "ins v1.s[1], v3.s[0] \n" - -// Read 8 Y and 4 VU from NV21 -#define READNV21 \ - "ld1 {v0.8b}, [%0], #8 \n" \ - "ld1 {v2.8b}, [%1], #8 \n" \ - "uzp1 v3.8b, v2.8b, v2.8b \n" \ - "uzp2 v1.8b, v2.8b, v2.8b \n" \ - "ins v1.s[1], v3.s[0] \n" + "ldr d0, [%[src_y]], #8 \n" \ + "ldr d1, [%[src_uv]], #8 \n" \ + "zip1 v0.16b, 
v0.16b, v0.16b \n" \ + "prfm pldl1keep, [%[src_y], 448] \n" \ + "tbl v1.16b, {v1.16b}, v2.16b \n" \ + "prfm pldl1keep, [%[src_uv], 448] \n" // Read 8 YUY2 -#define READYUY2 \ - "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \ - "uzp2 v3.8b, v1.8b, v1.8b \n" \ - "uzp1 v1.8b, v1.8b, v1.8b \n" \ - "ins v1.s[1], v3.s[0] \n" +#define READYUY2 \ + "ld2 {v0.8b, v1.8b}, [%[src_yuy2]], #16 \n" \ + "zip1 v0.16b, v0.16b, v0.16b \n" \ + "prfm pldl1keep, [%[src_yuy2], 448] \n" \ + "tbl v1.16b, {v1.16b}, v2.16b \n" // Read 8 UYVY -#define READUYVY \ - "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \ - "orr v0.8b, v3.8b, v3.8b \n" \ - "uzp1 v1.8b, v2.8b, v2.8b \n" \ - "uzp2 v3.8b, v2.8b, v2.8b \n" \ - "ins v1.s[1], v3.s[0] \n" - -#define YUVTORGB_SETUP \ - "ld3r {v24.8h, v25.8h, v26.8h}, [%[kUVBiasBGR]] \n" \ - "ld1r {v31.4s}, [%[kYToRgb]] \n" \ - "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \ - "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n" - -// clang-format off - -#define YUVTORGB(vR, vG, vB) \ - "uxtl v0.8h, v0.8b \n" /* Extract Y */ \ - "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \ - "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \ - "ushll v0.4s, v0.4h, #0 \n" \ - "mul v3.4s, v3.4s, v31.4s \n" \ - "mul v0.4s, v0.4s, v31.4s \n" \ - "sqshrun v0.4h, v0.4s, #16 \n" \ - "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \ - "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \ - "mov v2.d[0], v1.d[1] \n" /* Extract V */ \ - "uxtl v2.8h, v2.8b \n" \ - "uxtl v1.8h, v1.8b \n" /* Extract U */ \ - "mul v3.8h, v27.8h, v1.8h \n" \ - "mul v5.8h, v29.8h, v1.8h \n" \ - "mul v6.8h, v30.8h, v2.8h \n" \ - "mul v7.8h, v28.8h, v2.8h \n" \ - "sqadd v6.8h, v6.8h, v5.8h \n" \ - "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \ - "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \ - "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \ - "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \ - "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \ - "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \ - "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \ - "sqshrun " #vG ".8b, " 
#vG ".8h, #6 \n" /* G */ \ - "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ - -// clang-format on +#define READUYVY \ + "ld2 {v3.8b, v4.8b}, [%[src_uyvy]], #16 \n" \ + "zip1 v0.16b, v4.16b, v4.16b \n" \ + "prfm pldl1keep, [%[src_uyvy], 448] \n" \ + "tbl v1.16b, {v3.16b}, v2.16b \n" + +// UB VR UG VG +// YG BB BG BR +#define YUVTORGB_SETUP \ + "ld4r {v28.16b, v29.16b, v30.16b, v31.16b}, [%[kUVCoeff]] \n" \ + "ld4r {v24.8h, v25.8h, v26.8h, v27.8h}, [%[kRGBCoeffBias]] \n" + +// v16.8h: B +// v17.8h: G +// v18.8h: R + +// Convert from YUV to 2.14 fixed point RGB +#define YUVTORGB \ + "umull2 v3.4s, v0.8h, v24.8h \n" \ + "umull v6.8h, v1.8b, v30.8b \n" \ + "umull v0.4s, v0.4h, v24.4h \n" \ + "umlal2 v6.8h, v1.16b, v31.16b \n" /* DG */ \ + "uqshrn v0.4h, v0.4s, #16 \n" \ + "uqshrn2 v0.8h, v3.4s, #16 \n" /* Y */ \ + "umull v4.8h, v1.8b, v28.8b \n" /* DB */ \ + "umull2 v5.8h, v1.16b, v29.16b \n" /* DR */ \ + "add v17.8h, v0.8h, v26.8h \n" /* G */ \ + "add v16.8h, v0.8h, v4.8h \n" /* B */ \ + "add v18.8h, v0.8h, v5.8h \n" /* R */ \ + "uqsub v17.8h, v17.8h, v6.8h \n" /* G */ \ + "uqsub v16.8h, v16.8h, v25.8h \n" /* B */ \ + "uqsub v18.8h, v18.8h, v27.8h \n" /* R */ + +// Convert from 2.14 fixed point RGB To 8 bit RGB +#define RGBTORGB8 \ + "uqshrn v17.8b, v17.8h, #6 \n" \ + "uqshrn v16.8b, v16.8h, #6 \n" \ + "uqshrn v18.8b, v18.8h, #6 \n" + +#define YUVTORGB_REGS \ + "v0", "v1", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", "v25", \ + "v26", "v27", "v28", "v29", "v30", "v31" void I444ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, @@ -112,30 +120,22 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" /* A */ - "1: \n" - READYUV444 - "prfm pldl1keep, [%0, 448] \n" - YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" - "subs %w4, %w4, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 
\n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" /* A */ + "1: \n" READYUV444 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); } void I422ToARGBRow_NEON(const uint8_t* src_y, @@ -144,31 +144,22 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" /* A */ - - "1: \n" - READYUV422 - "prfm pldl1keep, [%0, 448] \n" - YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%1, 128] \n" - "prfm pldl1keep, [%2, 128] \n" - "subs %w4, %w4, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, 
#255 \n" /* A */ + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); } void I444AlphaToARGBRow_NEON(const uint8_t* src_y, @@ -178,32 +169,23 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP + asm volatile( + YUVTORGB_SETUP "1: \n" - READYUV444 - "prfm pldl1keep, [%0, 448] \n" - YUVTORGB(v22, v21, v20) - "ld1 {v23.8b}, [%3], #8 \n" - "prfm pldl1keep, [%1, 128] \n" - "prfm pldl1keep, [%2, 128] \n" - "prfm pldl1keep, [%3, 448] \n" - "subs %w5, %w5, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV444 + "prfm pldl1keep, [%[src_a], 448] \n" YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [src_a] "+r"(src_a), // %[src_a] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] 
"r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); } void I422AlphaToARGBRow_NEON(const uint8_t* src_y, @@ -213,32 +195,23 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP + asm volatile( + YUVTORGB_SETUP "1: \n" - READYUV422 - "prfm pldl1keep, [%0, 448] \n" - YUVTORGB(v22, v21, v20) - "ld1 {v23.8b}, [%3], #8 \n" - "prfm pldl1keep, [%1, 128] \n" - "prfm pldl1keep, [%2, 128] \n" - "prfm pldl1keep, [%3, 448] \n" - "subs %w5, %w5, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV422 + "prfm pldl1keep, [%[src_a], 448] \n" YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [src_a] "+r"(src_a), // %[src_a] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); } void I422ToRGBARow_NEON(const uint8_t* src_y, @@ -247,30 +220,22 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - 
YUVTORGB_SETUP - "movi v20.8b, #255 \n" /* A */ - "1: \n" - READYUV422 - "prfm pldl1keep, [%0, 448] \n" - YUVTORGB(v23, v22, v21) - "prfm pldl1keep, [%1, 128] \n" - "prfm pldl1keep, [%2, 128] \n" - "subs %w4, %w4, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgba), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "movi v15.8b, #255 \n" /* A */ + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v15.8b,v16.8b,v17.8b,v18.8b}, [%[dst_rgba]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgba] "+r"(dst_rgba), // %[dst_rgba] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v15"); } void I422ToRGB24Row_NEON(const uint8_t* src_y, @@ -279,39 +244,29 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - "prfm pldl1keep, [%0, 448] \n" - YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%1, 128] \n" - "prfm pldl1keep, [%2, 128] \n" - "subs %w4, %w4, #8 \n" - "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb24), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - 
[kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); } #define ARGBTORGB565 \ - "shll v0.8h, v22.8b, #8 \n" /* R */ \ - "shll v21.8h, v21.8b, #8 \n" /* G */ \ - "shll v20.8h, v20.8b, #8 \n" /* B */ \ - "sri v0.8h, v21.8h, #5 \n" /* RG */ \ - "sri v0.8h, v20.8h, #11 \n" /* RGB */ - -// clang-format off + "shll v18.8h, v18.8b, #8 \n" /* R */ \ + "shll v17.8h, v17.8b, #8 \n" /* G */ \ + "shll v16.8h, v16.8b, #8 \n" /* B */ \ + "sri v18.8h, v17.8h, #5 \n" /* RG */ \ + "sri v18.8h, v16.8h, #11 \n" /* RGB */ void I422ToRGB565Row_NEON(const uint8_t* src_y, const uint8_t* src_u, @@ -320,38 +275,29 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%0, 448] \n" - "subs %w4, %w4, #8 \n" - ARGBTORGB565 - "prfm pldl1keep, [%1, 128] \n" - "prfm pldl1keep, [%2, 128] \n" - "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. 
- "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb565), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 "subs %w[width], %w[width], #8 \n" ARGBTORGB565 + "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS); } #define ARGBTOARGB1555 \ - "shll v0.8h, v23.8b, #8 \n" /* A */ \ - "shll v22.8h, v22.8b, #8 \n" /* R */ \ - "shll v21.8h, v21.8b, #8 \n" /* G */ \ - "shll v20.8h, v20.8b, #8 \n" /* B */ \ - "sri v0.8h, v22.8h, #1 \n" /* AR */ \ - "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ - "sri v0.8h, v20.8h, #11 \n" /* ARGB */ + "shll v0.8h, v19.8b, #8 \n" /* A */ \ + "shll v18.8h, v18.8b, #8 \n" /* R */ \ + "shll v17.8h, v17.8b, #8 \n" /* G */ \ + "shll v16.8h, v16.8b, #8 \n" /* B */ \ + "sri v0.8h, v18.8h, #1 \n" /* AR */ \ + "sri v0.8h, v17.8h, #6 \n" /* ARG */ \ + "sri v0.8h, v16.8h, #11 \n" /* ARGB */ void I422ToARGB1555Row_NEON(const uint8_t* src_y, const uint8_t* src_u, @@ -360,40 +306,32 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" - READYUV422 - YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%0, 448] \n" - "subs %w4, %w4, #8 \n" - ARGBTOARGB1555 - "prfm pldl1keep, [%1, 128] \n" 
- "prfm pldl1keep, [%2, 128] \n" - "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb1555), // %3 - "+r"(width) // %4 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" ARGBTOARGB1555 + "st1 {v0.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels + // RGB565. + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb1555] "+r"(dst_argb1555), // %[dst_argb1555] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); } -// clang-format on #define ARGBTOARGB4444 \ - /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ - "ushr v20.8b, v20.8b, #4 \n" /* B */ \ - "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ - "ushr v22.8b, v22.8b, #4 \n" /* R */ \ - "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ - "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ - "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ + /* Input v16.8b<=B, v17.8b<=G, v18.8b<=R, v19.8b<=A, v23.8b<=0x0f */ \ + "ushr v16.8b, v16.8b, #4 \n" /* B */ \ + "bic v17.8b, v17.8b, v23.8b \n" /* G */ \ + "ushr v18.8b, v18.8b, #4 \n" /* R */ \ + "bic v19.8b, v19.8b, v23.8b \n" /* A */ \ + "orr v0.8b, v16.8b, v17.8b \n" /* BG */ \ + "orr v1.8b, v18.8b, v19.8b \n" /* RA */ \ "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ void I422ToARGB4444Row_NEON(const uint8_t* src_y, @@ -402,58 +340,46 @@ void I422ToARGB4444Row_NEON(const uint8_t* 
src_y, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v4.16b, #0x0f \n" // bits to clear with vbic. - "1: \n" - READYUV422 - YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%0, 448] \n" - "subs %w4, %w4, #8 \n" - "movi v23.8b, #255 \n" - ARGBTOARGB4444 - "prfm pldl1keep, [%1, 128] \n" - "prfm pldl1keep, [%2, 128] \n" - "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb4444), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "movi v23.16b, #0x0f \n" // bits to clear with + // vbic. + "1: \n" READYUV422 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "movi v19.8b, #255 \n" ARGBTOARGB4444 + "st1 {v0.8h}, [%[dst_argb4444]], #16 \n" // store 8 + // pixels + // ARGB4444. 
+ "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_argb4444] "+r"(dst_argb4444), // %[dst_argb4444] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19", "v23"); } void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" - READYUV400 - YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "1: \n" READYUV400 YUVTORGB + RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + : [src_y] "+r"(src_y), // %[src_y] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] + : "cc", "memory", YUVTORGB_REGS, "v19"); } void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { @@ -479,28 +405,22 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" - READNV12 - "prfm pldl1keep, [%0, 448] \n" - YUVTORGB(v22, v21, v20) - "prfm pldl1keep, 
[%1, 256] \n" - "subs %w3, %w3, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV12Table) + : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); } void NV21ToARGBRow_NEON(const uint8_t* src_y, @@ -508,28 +428,22 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" - READNV21 - "prfm pldl1keep, [%0, 448] \n" - YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%1, 256] \n" - "subs %w3, %w3, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + 
YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_vu), // %[src_uv] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV21Table) + : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); } void NV12ToRGB24Row_NEON(const uint8_t* src_y, @@ -537,27 +451,21 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READNV12 - "prfm pldl1keep, [%0, 448] \n" - YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%1, 256] \n" - "subs %w3, %w3, #8 \n" - "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_rgb24), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV12Table) + : "cc", "memory", YUVTORGB_REGS, "v2"); } void 
NV21ToRGB24Row_NEON(const uint8_t* src_y, @@ -565,27 +473,21 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READNV21 - "prfm pldl1keep, [%0, 448] \n" - YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%1, 256] \n" - "subs %w3, %w3, #8 \n" - "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_rgb24), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" + "b.gt 1b \n" + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_vu), // %[src_uv] + [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV21Table) + : "cc", "memory", YUVTORGB_REGS, "v2"); } void NV12ToRGB565Row_NEON(const uint8_t* src_y, @@ -594,75 +496,64 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile( - YUVTORGB_SETUP "1: \n" READNV12 - "prfm pldl1keep, [%0, 448] \n" YUVTORGB( - v22, v21, v20) ARGBTORGB565 - "prfm pldl1keep, [%1, 256] \n" - "subs %w3, %w3, #8 \n" - "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels + YUVTORGB_SETUP + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READNV12 YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" ARGBTORGB565 + "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 + // pixels + 
// RGB565. "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_rgb565), // %2 - "+r"(width) // %3 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV12Table) + : "cc", "memory", YUVTORGB_REGS, "v2"); } void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" - READYUY2 - "prfm pldl1keep, [%0, 448] \n" - YUVTORGB(v22, v21, v20) - "subs %w2, %w2, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READYUY2 YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), 
// %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV12Table) + : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); } void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" - READUYVY - YUVTORGB(v22, v21, v20) - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" - "b.gt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "ldr q2, [%[kNV12Table]] \n" + "1: \n" READUYVY YUVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" + : [src_uyvy] "+r"(src_uyvy), // %[src_yuy2] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias] + [kNV12Table] "r"(&kNV12Table) + : "cc", "memory", YUVTORGB_REGS, "v2", "v19"); } // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. 
@@ -673,8 +564,8 @@ void SplitUVRow_NEON(const uint8_t* src_uv, asm volatile( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV - "prfm pldl1keep, [%0, 448] \n" "subs %w3, %w3, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" "st1 {v0.16b}, [%1], #16 \n" // store U "st1 {v1.16b}, [%2], #16 \n" // store V "b.gt 1b \n" @@ -696,9 +587,9 @@ void MergeUVRow_NEON(const uint8_t* src_u, "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load U "ld1 {v1.16b}, [%1], #16 \n" // load V + "subs %w3, %w3, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" "prfm pldl1keep, [%1, 448] \n" - "subs %w3, %w3, #16 \n" // 16 processed per loop "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV "b.gt 1b \n" : "+r"(src_u), // %0 @@ -719,8 +610,8 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, asm volatile( "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB - "prfm pldl1keep, [%0, 448] \n" "subs %w4, %w4, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" "st1 {v0.16b}, [%1], #16 \n" // store R "st1 {v1.16b}, [%2], #16 \n" // store G "st1 {v2.16b}, [%3], #16 \n" // store B @@ -746,12 +637,11 @@ void MergeRGBRow_NEON(const uint8_t* src_r, "ld1 {v0.16b}, [%0], #16 \n" // load R "ld1 {v1.16b}, [%1], #16 \n" // load G "ld1 {v2.16b}, [%2], #16 \n" // load B + "subs %w4, %w4, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" "prfm pldl1keep, [%1, 448] \n" "prfm pldl1keep, [%2, 448] \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB - "prfm pldl1keep, [%0, 448] \n" "b.gt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 @@ -773,8 +663,8 @@ void SplitARGBRow_NEON(const uint8_t* src_rgba, asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w5, %w5, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" "st1 {v0.16b}, [%3], #16 \n" // store B "st1 {v1.16b}, [%2], #16 
\n" // store G "st1 {v2.16b}, [%1], #16 \n" // store R @@ -804,11 +694,11 @@ void MergeARGBRow_NEON(const uint8_t* src_r, "ld1 {v1.16b}, [%1], #16 \n" // load G "ld1 {v0.16b}, [%2], #16 \n" // load B "ld1 {v3.16b}, [%3], #16 \n" // load A + "subs %w5, %w5, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" "prfm pldl1keep, [%1, 448] \n" "prfm pldl1keep, [%2, 448] \n" "prfm pldl1keep, [%3, 448] \n" - "subs %w5, %w5, #16 \n" // 16 processed per loop "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB "b.gt 1b \n" : "+r"(src_r), // %0 @@ -831,8 +721,8 @@ void SplitXRGBRow_NEON(const uint8_t* src_rgba, asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w4, %w4, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" "st1 {v0.16b}, [%3], #16 \n" // store B "st1 {v1.16b}, [%2], #16 \n" // store G "st1 {v2.16b}, [%1], #16 \n" // store R @@ -859,10 +749,10 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, "ld1 {v2.16b}, [%0], #16 \n" // load R "ld1 {v1.16b}, [%1], #16 \n" // load G "ld1 {v0.16b}, [%2], #16 \n" // load B + "subs %w4, %w4, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" "prfm pldl1keep, [%1, 448] \n" "prfm pldl1keep, [%2, 448] \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%3], #64 \n" // store 16ARGB "b.gt 1b \n" : "+r"(src_r), // %0 @@ -875,6 +765,240 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, ); } +void MergeXR30Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width) { + int shift = 10 - depth; + asm volatile( + "movi v30.16b, #255 \n" + "ushr v30.4s, v30.4s, #22 \n" // 1023 + "dup v31.4s, %w5 \n" + "1: \n" + "ldr d2, [%2], #8 \n" // B + "ldr d1, [%1], #8 \n" // G + "ldr d0, [%0], #8 \n" // R + "ushll v2.4s, v2.4h, #0 \n" // B + "ushll v1.4s, v1.4h, #0 \n" // G + "ushll v0.4s, v0.4h, #0 \n" // R + "ushl 
v2.4s, v2.4s, v31.4s \n" // 000B + "ushl v1.4s, v1.4s, v31.4s \n" // G + "ushl v0.4s, v0.4s, v31.4s \n" // R + "umin v2.4s, v2.4s, v30.4s \n" + "umin v1.4s, v1.4s, v30.4s \n" + "umin v0.4s, v0.4s, v30.4s \n" + "sli v2.4s, v1.4s, #10 \n" // 00GB + "sli v2.4s, v0.4s, #20 \n" // 0RGB + "orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30) + "subs %w4, %w4, #4 \n" + "str q2, [%3], #16 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 + : "r"(shift) // %5 + : "memory", "cc", "v0", "v1", "v2", "v30", "v31"); +} + +void MergeXR30Row_10_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int /* depth */, + int width) { + asm volatile( + "movi v30.16b, #255 \n" + "ushr v30.4s, v30.4s, #22 \n" // 1023 + "1: \n" + "ldr d2, [%2], #8 \n" // B + "ldr d1, [%1], #8 \n" // G + "ldr d0, [%0], #8 \n" // R + "ushll v2.4s, v2.4h, #0 \n" // 000B + "ushll v1.4s, v1.4h, #0 \n" // G + "ushll v0.4s, v0.4h, #0 \n" // R + "umin v2.4s, v2.4s, v30.4s \n" + "umin v1.4s, v1.4s, v30.4s \n" + "umin v0.4s, v0.4s, v30.4s \n" + "sli v2.4s, v1.4s, #10 \n" // 00GB + "sli v2.4s, v0.4s, #20 \n" // 0RGB + "orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30) + "subs %w4, %w4, #4 \n" + "str q2, [%3], #16 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 + : + : "memory", "cc", "v0", "v1", "v2", "v30"); +} + +void MergeAR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + asm volatile( + + "dup v30.8h, %w7 \n" + "dup v31.8h, %w6 \n" + "1: \n" + "ldr q2, [%0], #16 \n" // R + "ldr q1, [%1], #16 \n" // G + "ldr q0, [%2], #16 \n" // B + "ldr q3, [%3], #16 \n" // A + "umin v2.8h, v2.8h, v30.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "umin v1.8h, v1.8h, v30.8h \n" + 
"prfm pldl1keep, [%1, 448] \n" + "umin v0.8h, v0.8h, v30.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "umin v3.8h, v3.8h, v30.8h \n" + "prfm pldl1keep, [%3, 448] \n" + "ushl v2.8h, v2.8h, v31.8h \n" + "ushl v1.8h, v1.8h, v31.8h \n" + "ushl v0.8h, v0.8h, v31.8h \n" + "ushl v3.8h, v3.8h, v31.8h \n" + "subs %w5, %w5, #8 \n" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%4], #64 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_ar64), // %4 + "+r"(width) // %5 + : "r"(shift), // %6 + "r"(mask) // %7 + : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); +} + +void MergeXR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + asm volatile( + + "movi v3.16b, #0xff \n" // A (0xffff) + "dup v30.8h, %w6 \n" + "dup v31.8h, %w5 \n" + + "1: \n" + "ldr q2, [%0], #16 \n" // R + "ldr q1, [%1], #16 \n" // G + "ldr q0, [%2], #16 \n" // B + "umin v2.8h, v2.8h, v30.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "umin v1.8h, v1.8h, v30.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "umin v0.8h, v0.8h, v30.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "ushl v2.8h, v2.8h, v31.8h \n" + "ushl v1.8h, v1.8h, v31.8h \n" + "ushl v0.8h, v0.8h, v31.8h \n" + "subs %w4, %w4, #8 \n" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar64), // %3 + "+r"(width) // %4 + : "r"(shift), // %5 + "r"(mask) // %6 + : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); +} + +void MergeARGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width) { + int shift = 8 - depth; + asm volatile( + + "dup v31.8h, %w6 \n" + "1: \n" + "ldr q2, [%0], #16 \n" // R + "ldr q1, [%1], #16 \n" // G + "ldr q0, [%2], #16 \n" // B + "ldr q3, [%3], #16 \n" // A + "ushl 
v2.8h, v2.8h, v31.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "ushl v1.8h, v1.8h, v31.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "ushl v0.8h, v0.8h, v31.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "ushl v3.8h, v3.8h, v31.8h \n" + "prfm pldl1keep, [%3, 448] \n" + "uqxtn v2.8b, v2.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v0.8b, v0.8h \n" + "uqxtn v3.8b, v3.8h \n" + "subs %w5, %w5, #8 \n" + "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%4], #32 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : "r"(shift) // %6 + : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); +} + +void MergeXRGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width) { + int shift = 8 - depth; + asm volatile( + + "dup v31.8h, %w5 \n" + "movi v3.8b, #0xff \n" // A (0xff) + "1: \n" + "ldr q2, [%0], #16 \n" // R + "ldr q1, [%1], #16 \n" // G + "ldr q0, [%2], #16 \n" // B + "ushl v2.8h, v2.8h, v31.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "ushl v1.8h, v1.8h, v31.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "ushl v0.8h, v0.8h, v31.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "uqxtn v2.8b, v2.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v0.8b, v0.8h \n" + "subs %w4, %w4, #8 \n" + "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(shift) // %5 + : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); +} + // Copy multiple of 32. void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( @@ -1072,10 +1196,10 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { "movi v5.8b, #255 \n" // Alpha "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "orr v3.8b, v1.8b, v1.8b \n" // move g "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v3.8b, v1.8b, v1.8b \n" // move g - "orr v4.8b, v0.8b, v0.8b \n" // move r + "orr v4.8b, v0.8b, v0.8b \n" // move r "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a "b.gt 1b \n" : "+r"(src_raw), // %0 @@ -1091,10 +1215,10 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { "movi v0.8b, #255 \n" // Alpha "1: \n" "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v2.8b, v4.8b, v4.8b \n" // move g "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v2.8b, v4.8b, v4.8b \n" // move g - "orr v1.8b, v5.8b, v5.8b \n" // move r + "orr v1.8b, v5.8b, v5.8b \n" // move r "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r "b.gt 1b \n" : "+r"(src_raw), // %0 @@ -1109,9 +1233,9 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { asm volatile( "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "orr v3.8b, v1.8b, v1.8b \n" // move g + "prfm pldl1keep, [%0, 448] \n" "orr v4.8b, v0.8b, v0.8b \n" // move r "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r "b.gt 1b \n" @@ -1143,9 +1267,8 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, "movi v3.8b, #255 \n" // Alpha "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. - RGB565TOARGB + "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_rgb565), // %0 @@ -1233,9 +1356,8 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, asm volatile( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 
- "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB + "prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_argb4444), // %0 @@ -1252,8 +1374,8 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, asm volatile( "1: \n" "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of // RGB24 "b.gt 1b \n" @@ -1269,9 +1391,9 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { asm volatile( "1: \n" "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v4.8b, v2.8b, v2.8b \n" // mov g "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v4.8b, v2.8b, v2.8b \n" // mov g "orr v5.8b, v1.8b, v1.8b \n" // mov b "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b "b.gt 1b \n" @@ -1287,8 +1409,8 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { asm volatile( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #16 \n" // 16 processed per loop. + "prfm pldl1keep, [%0, 448] \n" "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. "b.gt 1b \n" : "+r"(src_yuy2), // %0 @@ -1303,8 +1425,8 @@ void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { asm volatile( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #16 \n" // 16 processed per loop. + "prfm pldl1keep, [%0, 448] \n" "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. 
"b.gt 1b \n" : "+r"(src_uyvy), // %0 @@ -1322,8 +1444,8 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 - "prfm pldl1keep, [%0, 448] \n" "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "prfm pldl1keep, [%0, 448] \n" "st1 {v1.8b}, [%1], #8 \n" // store 8 U. "st1 {v3.8b}, [%2], #8 \n" // store 8 V. "b.gt 1b \n" @@ -1343,8 +1465,8 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY - "prfm pldl1keep, [%0, 448] \n" "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "prfm pldl1keep, [%0, 448] \n" "st1 {v0.8b}, [%1], #8 \n" // store 8 U. "st1 {v2.8b}, [%2], #8 \n" // store 8 V. "b.gt 1b \n" @@ -1366,10 +1488,10 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "prfm pldl1keep, [%0, 448] \n" "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U + "prfm pldl1keep, [%0, 448] \n" "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V "st1 {v1.8b}, [%2], #8 \n" // store 8 U. "st1 {v3.8b}, [%3], #8 \n" // store 8 V. @@ -1394,10 +1516,10 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "prfm pldl1keep, [%0, 448] \n" "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U + "prfm pldl1keep, [%0, 448] \n" "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V "st1 {v0.8b}, [%2], #8 \n" // store 8 U. "st1 {v2.8b}, [%3], #8 \n" // store 8 V. @@ -1422,8 +1544,8 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, "ld1 {v2.16b}, [%3] \n" // shuffler "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. 
- "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #4 \n" // 4 processed per loop + "prfm pldl1keep, [%0, 448] \n" "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels "st1 {v1.16b}, [%1], #16 \n" // store 4. "b.gt 1b \n" @@ -1443,11 +1565,11 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y, asm volatile( "1: \n" "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys - "prfm pldl1keep, [%0, 448] \n" + "subs %w4, %w4, #16 \n" // 16 pixels "orr v2.8b, v1.8b, v1.8b \n" + "prfm pldl1keep, [%0, 448] \n" "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs - "subs %w4, %w4, #16 \n" // 16 pixels "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. "b.gt 1b \n" : "+r"(src_y), // %0 @@ -1467,8 +1589,8 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, asm volatile( "1: \n" "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys - "prfm pldl1keep, [%0, 448] \n" "orr v3.8b, v2.8b, v2.8b \n" + "prfm pldl1keep, [%0, 448] \n" "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs "subs %w4, %w4, #16 \n" // 16 pixels @@ -1488,18 +1610,17 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 // pixels - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGBTORGB565 - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. + "prfm pldl1keep, [%0, 448] \n" ARGBTORGB565 + "st1 {v18.16b}, [%1], #16 \n" // store 8 pixels RGB565. 
"b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb565), // %1 "+r"(width) // %2 : - : "cc", "memory", "v0", "v20", "v21", "v22", "v23"); + : "cc", "memory", "v16", "v17", "v18", "v19"); } void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, @@ -1509,20 +1630,20 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, asm volatile( "dup v1.4s, %w2 \n" // dither4 "1: \n" - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // load 8 // pixels - "prfm pldl1keep, [%0, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v20.8b, v20.8b, v1.8b \n" - "uqadd v21.8b, v21.8b, v1.8b \n" - "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565 - "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. + "uqadd v16.8b, v16.8b, v1.8b \n" + "prfm pldl1keep, [%0, 448] \n" + "uqadd v17.8b, v17.8b, v1.8b \n" + "uqadd v18.8b, v18.8b, v1.8b \n" ARGBTORGB565 + "st1 {v18.16b}, [%0], #16 \n" // store 8 pixels RGB565. "b.gt 1b \n" : "+r"(dst_rgb) // %0 : "r"(src_argb), // %1 "r"(dither4), // %2 "r"(width) // %3 - : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"); + : "cc", "memory", "v1", "v16", "v17", "v18", "v19"); } void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, @@ -1530,39 +1651,131 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 // pixels - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- ARGBTOARGB1555 + "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB1555 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb1555), // %1 "+r"(width) // %2 : - : "cc", "memory", "v0", "v20", "v21", "v22", "v23"); + : "cc", "memory", "v0", "v16", "v17", "v18", "v19"); } void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb4444, int width) { asm volatile( - "movi v4.16b, #0x0f \n" // bits to clear with + "movi v23.16b, #0x0f \n" // bits to clear with // vbic. "1: \n" - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 // pixels - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGBTOARGB4444 + "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB4444 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb4444), // %1 "+r"(width) // %2 : - : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"); + : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v23"); +} + +static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, + 10, 9, 8, 11, 14, 13, 12, 15}; + +void ARGBToAR64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width) { + asm volatile( + "1: \n" + "ldp q0, q2, [%0], #32 \n" // load 8 pixels + "mov v1.16b, v0.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "mov v3.16b, v2.16b \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels + "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ar64), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +void ARGBToAB64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ab64, + int width) { + asm volatile( + "ld1 {v4.16b}, %3 \n" // shuffler + "1: \n" + "ldp q0, q2, [%0], #32 \n" // load 8 pixels + "tbl v0.16b, {v0.16b}, v4.16b \n" + "tbl v2.16b, {v2.16b}, v4.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "mov v1.16b, v0.16b \n" + "mov v3.16b, v2.16b \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels + "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToABGR) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} + +static const uvec8 kShuffleAR64ToARGB = {1, 3, 5, 7, 9, 11, 13, 15, + 17, 19, 21, 23, 25, 27, 29, 31}; + +void AR64ToARGBRow_NEON(const uint16_t* src_ar64, + uint8_t* dst_argb, + int width) { + asm volatile( + "ld1 {v4.16b}, %3 \n" // shuffler + "1: \n" + "ldp q0, q1, [%0], #32 \n" // load 4 pixels + "ldp q2, q3, [%0], #32 \n" // load 4 pixels + "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "stp q0, q2, [%1], #32 \n" // store 8 pixels + "b.gt 1b \n" + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAR64ToARGB) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} + +static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15, + 21, 19, 17, 23, 29, 27, 25, 31}; + +void AB64ToARGBRow_NEON(const uint16_t* src_ab64, + uint8_t* dst_argb, + int width) { + asm volatile( + "ld1 {v4.16b}, %3 \n" // shuffler + "1: \n" + "ldp q0, q1, [%0], #32 \n" // load 4 pixels + "ldp q2, q3, [%0], #32 \n" // load 4 pixels + "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "stp q0, q2, [%1], #32 \n" // store 8 pixels + "b.gt 1b \n" + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAB64ToARGB) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); } void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { @@ -1573,9 +1786,9 @@ void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v3.8h, v0.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y @@ -1614,9 +1827,9 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { "movi v6.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. 
"umull v3.8h, v0.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y @@ -1629,22 +1842,22 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); } -void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { +void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( "movi v4.8b, #29 \n" // B * 0.1140 coefficient "movi v5.8b, #150 \n" // G * 0.5870 coefficient "movi v6.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 RGBA - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v0.8h, v1.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v0.8h, v2.8b, v5.8b \n" // G "umlal v0.8h, v3.8b, v6.8b \n" // R "uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y "st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y. "b.gt 1b \n" - : "+r"(src_argb), // %0 + : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : @@ -1666,9 +1879,9 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb, "movi v29.16b,#0x80 \n" // 128.5 "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v24.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlsl v4.8h, v1.8b, v25.8b \n" // G "umlsl v4.8h, v2.8b, v26.8b \n" // R "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned @@ -1729,14 +1942,14 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. @@ -1775,13 +1988,13 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. @@ -1815,13 +2028,13 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra, RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. @@ -1855,13 +2068,13 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr, RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. 
+ "prfm pldl1keep, [%0, 448] \n" "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. - "prfm pldl1keep, [%1, 448] \n" "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. @@ -1895,13 +2108,13 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba, RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. @@ -1935,13 +2148,13 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, RGBTOUV_SETUP_REG "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. @@ -1975,13 +2188,13 @@ void RAWToUVRow_NEON(const uint8_t* src_raw, RGBTOUV_SETUP_REG "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. 
- "prfm pldl1keep, [%0, 448] \n" "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels - "prfm pldl1keep, [%1, 448] \n" "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. @@ -2016,9 +2229,9 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, RGBTOUV_SETUP_REG "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. @@ -2028,9 +2241,9 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. - "prfm pldl1keep, [%1, 448] \n" RGB565TOARGB "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. @@ -2074,9 +2287,9 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, RGBTOUV_SETUP_REG "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "prfm pldl1keep, [%0, 448] \n" RGB555TOARGB "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. 
@@ -2086,9 +2299,9 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. - "prfm pldl1keep, [%1, 448] \n" RGB555TOARGB "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. @@ -2132,9 +2345,9 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, RGBTOUV_SETUP_REG // sets v20-v25 "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. @@ -2144,9 +2357,9 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. - "prfm pldl1keep, [%1, 448] \n" ARGB4444TOARGB "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. @@ -2189,10 +2402,10 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { "movi v27.8b, #16 \n" // Add 16 constant "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. 
RGB565TOARGB "umull v3.8h, v0.8b, v24.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v3.8h, v1.8b, v25.8b \n" // G "umlal v3.8h, v2.8b, v26.8b \n" // R "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y @@ -2217,10 +2430,10 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB1555TOARGB "umull v3.8h, v0.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y @@ -2244,10 +2457,10 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, "movi v27.8b, #16 \n" // Add 16 constant "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB4444TOARGB "umull v3.8h, v0.8b, v24.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v3.8h, v1.8b, v25.8b \n" // G "umlal v3.8h, v2.8b, v26.8b \n" // R "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y @@ -2269,9 +2482,9 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v1.8b, v4.8b \n" // R + "prfm pldl1keep, [%0, 448] \n" "umlal v16.8h, v2.8b, v5.8b \n" // G "umlal v16.8h, v3.8b, v6.8b \n" // B "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y @@ -2293,9 +2506,9 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. 
"umull v16.8h, v0.8b, v4.8b \n" // R + "prfm pldl1keep, [%0, 448] \n" "umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v6.8b \n" // B "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y @@ -2317,9 +2530,9 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v1.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v16.8h, v2.8b, v5.8b \n" // G "umlal v16.8h, v3.8b, v6.8b \n" // R "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y @@ -2341,9 +2554,9 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // B "umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y @@ -2365,9 +2578,9 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // B "umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y @@ -2388,9 +2601,9 @@ void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { "movi v6.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. 
+ "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v0.8h, v0.8b, v4.8b \n" // B "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // B "umlal v0.8h, v1.8b, v5.8b \n" // G "umlal v0.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y @@ -2410,9 +2623,9 @@ void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { "movi v4.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v0.8h, v0.8b, v4.8b \n" // B "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // B "umlal v0.8h, v1.8b, v5.8b \n" // G "umlal v0.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y @@ -2446,11 +2659,11 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, "1: \n" "ld1 {v0.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%2], #16 \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" "subs %w3, %w3, #16 \n" "umull v2.8h, v0.8b, v4.8b \n" + "prfm pldl1keep, [%1, 448] \n" "umull2 v3.8h, v0.16b, v4.16b \n" + "prfm pldl1keep, [%2, 448] \n" "umlal v2.8h, v1.8b, v5.8b \n" "umlal2 v3.8h, v1.16b, v5.16b \n" "rshrn v0.8b, v2.8h, #8 \n" @@ -2463,10 +2676,10 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, "50: \n" "ld1 {v0.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%2], #16 \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%1, 448] \n" "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" "st1 {v0.16b}, [%0], #16 \n" "b.gt 50b \n" "b 99f \n" @@ -2474,8 +2687,8 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, // Blend 100 / 0 - Copy row unchanged. 
"100: \n" "ld1 {v0.16b}, [%1], #16 \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%1, 448] \n" "st1 {v0.16b}, [%0], #16 \n" "b.gt 100b \n" @@ -2491,7 +2704,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, } // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr -void ARGBBlendRow_NEON(const uint8_t* src_argb0, +void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -2502,11 +2715,11 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0, "8: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. "umull v16.8h, v4.8b, v3.8b \n" // db * a + "prfm pldl1keep, [%0, 448] \n" "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "prfm pldl1keep, [%1, 448] \n" "umull v18.8h, v6.8b, v3.8b \n" // dr * a "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 @@ -2532,11 +2745,11 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0, // ARGB0. "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel // ARGB1. - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #1 \n" // 1 processed per loop. "umull v16.8h, v4.8b, v3.8b \n" // db * a + "prfm pldl1keep, [%0, 448] \n" "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "prfm pldl1keep, [%1, 448] \n" "umull v18.8h, v6.8b, v3.8b \n" // dr * a "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 @@ -2553,7 +2766,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0, "99: \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2570,14 +2783,14 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, // Attenuate 8 pixels. 
"1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v3.8b \n" // b * a - "umull v5.8h, v1.8b, v3.8b \n" // g * a - "umull v6.8h, v2.8b, v3.8b \n" // r * a - "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 - "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 - "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 + "prfm pldl1keep, [%0, 448] \n" + "umull v5.8h, v1.8b, v3.8b \n" // g * a + "umull v6.8h, v2.8b, v3.8b \n" // r * a + "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 + "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 + "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_argb), // %0 @@ -2603,9 +2816,9 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb, // 8 pixel loop. "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "uxtl v0.8h, v0.8b \n" // b (0 .. 255) "prfm pldl1keep, [%0, 448] \n" - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "uxtl v0.8h, v0.8b \n" // b (0 .. 255) "uxtl v1.8h, v1.8b \n" "uxtl v2.8h, v2.8b \n" "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale @@ -2645,9 +2858,9 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, // 8 pixel loop. "1: \n" "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "uxtl v4.8h, v4.8b \n" // b (0 .. 255) + "prfm pldl1keep, [%0, 448] \n" "uxtl v5.8h, v5.8b \n" "uxtl v6.8h, v6.8b \n" "uxtl v7.8h, v7.8b \n" @@ -2678,9 +2891,9 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { "movi v26.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. 
"umull v4.8h, v0.8b, v24.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v4.8h, v1.8b, v25.8b \n" // G "umlal v4.8h, v2.8b, v26.8b \n" // R "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B @@ -2713,9 +2926,9 @@ void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { "movi v30.8b, #50 \n" // BR coefficient "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B "prfm pldl1keep, [%0, 448] \n" - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B "umlal v4.8h, v1.8b, v21.8b \n" // G "umlal v4.8h, v2.8b, v22.8b \n" // R "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G @@ -2750,9 +2963,9 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit + "prfm pldl1keep, [%0, 448] \n" "uxtl v17.8h, v17.8b \n" // g "uxtl v18.8h, v18.8b \n" // r "uxtl v19.8h, v19.8b \n" // a @@ -2800,7 +3013,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, +void ARGBMultiplyRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -2809,11 +3022,11 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. 
"umull v0.8h, v0.8b, v4.8b \n" // multiply B + "prfm pldl1keep, [%0, 448] \n" "umull v1.8h, v1.8b, v5.8b \n" // multiply G + "prfm pldl1keep, [%1, 448] \n" "umull v2.8h, v2.8b, v6.8b \n" // multiply R "umull v3.8h, v3.8b, v7.8b \n" // multiply A "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B @@ -2822,7 +3035,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "b.gt 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2831,7 +3044,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, } // Add 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBAddRow_NEON(const uint8_t* src_argb0, +void ARGBAddRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -2840,16 +3053,16 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0, "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. "uqadd v0.8b, v0.8b, v4.8b \n" + "prfm pldl1keep, [%0, 448] \n" "uqadd v1.8b, v1.8b, v5.8b \n" + "prfm pldl1keep, [%1, 448] \n" "uqadd v2.8b, v2.8b, v6.8b \n" "uqadd v3.8b, v3.8b, v7.8b \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "b.gt 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2858,7 +3071,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0, } // Subtract 2 rows of ARGB pixels, 8 pixels at a time. 
-void ARGBSubtractRow_NEON(const uint8_t* src_argb0, +void ARGBSubtractRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -2867,16 +3080,16 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0, "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. "uqsub v0.8b, v0.8b, v4.8b \n" + "prfm pldl1keep, [%0, 448] \n" "uqsub v1.8b, v1.8b, v5.8b \n" + "prfm pldl1keep, [%1, 448] \n" "uqsub v2.8b, v2.8b, v6.8b \n" "uqsub v3.8b, v3.8b, v7.8b \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "b.gt 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2899,11 +3112,11 @@ void SobelRow_NEON(const uint8_t* src_sobelx, "1: \n" "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. "uqadd v0.8b, v0.8b, v1.8b \n" // add + "prfm pldl1keep, [%0, 448] \n" "orr v1.8b, v0.8b, v0.8b \n" + "prfm pldl1keep, [%1, 448] \n" "orr v2.8b, v0.8b, v0.8b \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "b.gt 1b \n" @@ -2925,10 +3138,10 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #16 \n" // 16 processed per loop. + "prfm pldl1keep, [%0, 448] \n" "uqadd v0.16b, v0.16b, v1.16b \n" // add + "prfm pldl1keep, [%1, 448] \n" "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. "b.gt 1b \n" : "+r"(src_sobelx), // %0 @@ -2954,10 +3167,10 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx, "1: \n" "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. 
"ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" "uqadd v1.8b, v0.8b, v2.8b \n" // add + "prfm pldl1keep, [%1, 448] \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_sobelx), // %0 @@ -2981,18 +3194,18 @@ void SobelXRow_NEON(const uint8_t* src_y0, "1: \n" "ld1 {v0.8b}, [%0],%5 \n" // top "ld1 {v1.8b}, [%0],%6 \n" - "prfm pldl1keep, [%0, 448] \n" "usubl v0.8h, v0.8b, v1.8b \n" + "prfm pldl1keep, [%0, 448] \n" "ld1 {v2.8b}, [%1],%5 \n" // center * 2 "ld1 {v3.8b}, [%1],%6 \n" - "prfm pldl1keep, [%1, 448] \n" "usubl v1.8h, v2.8b, v3.8b \n" + "prfm pldl1keep, [%1, 448] \n" "add v0.8h, v0.8h, v1.8h \n" "add v0.8h, v0.8h, v1.8h \n" "ld1 {v2.8b}, [%2],%5 \n" // bottom "ld1 {v3.8b}, [%2],%6 \n" - "prfm pldl1keep, [%2, 448] \n" "subs %w4, %w4, #8 \n" // 8 pixels + "prfm pldl1keep, [%2, 448] \n" "usubl v1.8h, v2.8b, v3.8b \n" "add v0.8h, v0.8h, v1.8h \n" "abs v0.8h, v0.8h \n" @@ -3030,11 +3243,11 @@ void SobelYRow_NEON(const uint8_t* src_y0, "add v0.8h, v0.8h, v1.8h \n" "ld1 {v2.8b}, [%0],%5 \n" // right "ld1 {v3.8b}, [%1],%5 \n" - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 pixels "usubl v1.8h, v2.8b, v3.8b \n" + "prfm pldl1keep, [%0, 448] \n" "add v0.8h, v0.8h, v1.8h \n" + "prfm pldl1keep, [%1, 448] \n" "abs v0.8h, v0.8h \n" "uqxtn v0.8b, v0.8h \n" "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely @@ -3057,9 +3270,9 @@ void HalfFloat1Row_NEON(const uint16_t* src, asm volatile( "1: \n" "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 pixels per loop "uxtl v2.4s, v1.4h \n" // 8 int's + "prfm pldl1keep, [%0, 448] \n" "uxtl2 v3.4s, v1.8h \n" "scvtf v2.4s, v2.4s \n" // 8 floats "scvtf v3.4s, v3.4s \n" @@ -3081,9 +3294,9 @@ void HalfFloatRow_NEON(const uint16_t* src, asm volatile( "1: \n" "ld1 
{v1.16b}, [%0], #16 \n" // load 8 shorts - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 pixels per loop "uxtl v2.4s, v1.4h \n" // 8 int's + "prfm pldl1keep, [%0, 448] \n" "uxtl2 v3.4s, v1.8h \n" "scvtf v2.4s, v2.4s \n" // 8 floats "scvtf v3.4s, v3.4s \n" @@ -3107,9 +3320,9 @@ void ByteToFloatRow_NEON(const uint8_t* src, asm volatile( "1: \n" "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 pixels per loop "uxtl v1.8h, v1.8b \n" // 8 shorts + "prfm pldl1keep, [%0, 448] \n" "uxtl v2.4s, v1.4h \n" // 8 ints "uxtl2 v3.4s, v1.8h \n" "scvtf v2.4s, v2.4s \n" // 8 floats @@ -3136,9 +3349,9 @@ float ScaleMaxSamples_NEON(const float* src, "1: \n" "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "prfm pldl1keep, [%0, 448] \n" "fmul v4.4s, v2.4s, %4.s[0] \n" // scale "fmax v5.4s, v5.4s, v1.4s \n" // max "fmax v6.4s, v6.4s, v2.4s \n" @@ -3166,9 +3379,9 @@ float ScaleSumSamples_NEON(const float* src, "1: \n" "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "prfm pldl1keep, [%0, 448] \n" "fmul v4.4s, v2.4s, %4.s[0] \n" "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares "fmla v6.4s, v2.4s, v2.4s \n" @@ -3376,10 +3589,10 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, "1: \n" "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values + "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values "prfm pldl1keep, [%0, 448] \n" + "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values "prfm pldl1keep, [%1, 448] \n" - "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values - "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values "subs %w3, %w3, #16 \n" // 16 pixels per loop "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV 
pixels "b.gt 1b \n" @@ -3391,6 +3604,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, : "cc", "memory", "v0", "v1", "v2"); } +// AYUV is YVUA in memory. UV for NV12 is UV order in memory. void AYUVToUVRow_NEON(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_uv, @@ -3400,12 +3614,12 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average "uqrshrn v2.8b, v1.8h, #2 \n" @@ -3429,12 +3643,12 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. 
"uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average "uqrshrn v1.8b, v1.8h, #2 \n" @@ -3454,8 +3668,8 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #16 \n" // 16 pixels per loop + "prfm pldl1keep, [%0, 448] \n" "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels "b.gt 1b \n" : "+r"(src_ayuv), // %0 @@ -3476,9 +3690,9 @@ void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { "1: \n" "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values "ld1 {v1.16b}, [%0], 16 \n" - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #16 \n" // 16 pixels per loop "tbl v0.16b, {v0.16b}, v2.16b \n" + "prfm pldl1keep, [%0, 448] \n" "tbl v1.16b, {v1.16b}, v2.16b \n" "stp q0, q1, [%1], 32 \n" // store 16 VU pixels "b.gt 1b \n" @@ -3531,34 +3745,24 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv, uint16_t* dst_v, int depth, int width) { + int shift = depth - 16; // Negative for right shift. 
asm volatile( - "dup v0.4s, %w3 \n" + "dup v2.8h, %w4 \n" "1: \n" - "ld2 {v1.8h, v2.8h}, [%0], #32 \n" // load 8 UV + "ld2 {v0.8h, v1.8h}, [%0], #32 \n" // load 8 UV + "subs %w3, %w3, #8 \n" // 8 src pixels per loop + "ushl v0.8h, v0.8h, v2.8h \n" "prfm pldl1keep, [%0, 448] \n" - "ushll v3.4s, v1.4h, #0 \n" - "ushll2 v4.4s, v1.8h, #0 \n" - "ushl v3.4s, v3.4s, v0.4s \n" - "ushl v4.4s, v4.4s, v0.4s \n" - "xtn v1.4h, v3.4s \n" - "xtn2 v1.8h, v4.4s \n" - "ushll v3.4s, v2.4h, #0 \n" - "ushll2 v4.4s, v2.8h, #0 \n" - "ushl v3.4s, v3.4s, v0.4s \n" - "ushl v4.4s, v4.4s, v0.4s \n" - "xtn v2.4h, v3.4s \n" - "xtn2 v2.8h, v4.4s \n" - "subs %w4, %w4, #8 \n" // 8 src pixels per loop - "st1 {v1.8h}, [%1], #16 \n" // store 8 U pixels - "st1 {v2.8h}, [%2], #16 \n" // store 8 V pixels + "ushl v1.8h, v1.8h, v2.8h \n" + "st1 {v0.8h}, [%1], #16 \n" // store 8 U pixels + "st1 {v1.8h}, [%2], #16 \n" // store 8 V pixels "b.gt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(depth), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); + "+r"(width) // %3 + : "r"(shift) // %4 + : "cc", "memory", "v0", "v1", "v2"); } void MergeUVRow_16_NEON(const uint16_t* src_u, @@ -3568,23 +3772,22 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, int width) { int shift = 16 - depth; asm volatile( - "dup v2.8h, %w3 \n" + "dup v2.8h, %w4 \n" "1: \n" "ld1 {v0.8h}, [%0], #16 \n" // load 8 U - "prfm pldl1keep, [%0, 448] \n" + "subs %w3, %w3, #8 \n" // 8 src pixels per loop "ld1 {v1.8h}, [%1], #16 \n" // load 8 V - "prfm pldl1keep, [%1, 448] \n" "ushl v0.8h, v0.8h, v2.8h \n" + "prfm pldl1keep, [%0, 448] \n" "ushl v1.8h, v1.8h, v2.8h \n" - "subs %w4, %w4, #8 \n" // 8 src pixels per loop + "prfm pldl1keep, [%1, 448] \n" "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels "b.gt 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 - "+r"(shift), // %3 - "+r"(width) // %4 - : + "+r"(width) // %3 + : "r"(shift) // %4 : "cc", "memory", "v0", "v1", 
"v2"); } @@ -3595,10 +3798,9 @@ void MultiplyRow_16_NEON(const uint16_t* src_y, asm volatile( "dup v2.8h, %w2 \n" "1: \n" - "ldp q0, q1, [%0] \n" - "add %0, %0, #32 \n" - "prfm pldl1keep, [%0, 448] \n" + "ldp q0, q1, [%0], #32 \n" "mul v0.8h, v0.8h, v2.8h \n" + "prfm pldl1keep, [%0, 448] \n" "mul v1.8h, v1.8h, v2.8h \n" "stp q0, q1, [%1] \n" // store 16 pixels "add %1, %1, #32 \n" @@ -3619,11 +3821,10 @@ void DivideRow_16_NEON(const uint16_t* src_y, asm volatile( "dup v0.8h, %w2 \n" "1: \n" - "ldp q1, q2, [%0] \n" - "add %0, %0, #32 \n" - "prfm pldl1keep, [%0, 448] \n" + "ldp q1, q2, [%0], #32 \n" "ushll v3.4s, v1.4h, #0 \n" "ushll v4.4s, v2.4h, #0 \n" + "prfm pldl1keep, [%0, 448] \n" "ushll2 v1.4s, v1.8h, #0 \n" "ushll2 v2.4s, v2.8h, #0 \n" "mul v3.4s, v0.4s, v3.4s \n" diff --git a/third_party/libyuv/source/row_win.cc b/third_party/libyuv/source/row_win.cc index 951518926f..5203b57c69 100644 --- a/third_party/libyuv/source/row_win.cc +++ b/third_party/libyuv/source/row_win.cc @@ -10,9 +10,9 @@ #include "libyuv/row.h" -// This module is for Visual C 32/64 bit and clangcl 32 bit +// This module is for Visual C 32/64 bit #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__))) + !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) #if defined(_M_X64) #include <emmintrin.h> @@ -29,9 +29,9 @@ extern "C" { // Read 8 UV from 444 #define READYUV444 \ - xmm0 = _mm_loadl_epi64((__m128i*)u_buf); \ + xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \ xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ u_buf += 8; \ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ @@ -39,9 +39,9 @@ extern "C" { // Read 8 UV from 444, With 8 Alpha. 
#define READYUVA444 \ - xmm0 = _mm_loadl_epi64((__m128i*)u_buf); \ + xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \ xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ u_buf += 8; \ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ @@ -51,10 +51,10 @@ extern "C" { // Read 4 UV from 422, upsample to 8 UV. #define READYUV422 \ - xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ + xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ + xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ + xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \ u_buf += 4; \ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ @@ -62,10 +62,10 @@ extern "C" { // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. #define READYUVA422 \ - xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ + xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ + xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ + xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \ u_buf += 4; \ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ @@ -74,24 +74,21 @@ extern "C" { a_buf += 8; // Convert 8 pixels: 8 UV and 8 Y. 
-#define YUVTORGB(yuvconstants) \ - xmm1 = _mm_loadu_si128(&xmm0); \ - xmm2 = _mm_loadu_si128(&xmm0); \ - xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \ - xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \ - xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \ - xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \ - xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \ - xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \ - xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ - xmm0 = _mm_adds_epi16(xmm0, xmm4); \ - xmm1 = _mm_adds_epi16(xmm1, xmm4); \ - xmm2 = _mm_adds_epi16(xmm2, xmm4); \ - xmm0 = _mm_srai_epi16(xmm0, 6); \ - xmm1 = _mm_srai_epi16(xmm1, 6); \ - xmm2 = _mm_srai_epi16(xmm2, 6); \ - xmm0 = _mm_packus_epi16(xmm0, xmm0); \ - xmm1 = _mm_packus_epi16(xmm1, xmm1); \ +#define YUVTORGB(yuvconstants) \ + xmm3 = _mm_sub_epi8(xmm3, _mm_set1_epi8(0x80)); \ + xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ + xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kYBiasToRgb); \ + xmm0 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToB, xmm3); \ + xmm1 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToG, xmm3); \ + xmm2 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToR, xmm3); \ + xmm0 = _mm_adds_epi16(xmm4, xmm0); \ + xmm1 = _mm_subs_epi16(xmm4, xmm1); \ + xmm2 = _mm_adds_epi16(xmm4, xmm2); \ + xmm0 = _mm_srai_epi16(xmm0, 6); \ + xmm1 = _mm_srai_epi16(xmm1, 6); \ + xmm2 = _mm_srai_epi16(xmm2, 6); \ + xmm0 = _mm_packus_epi16(xmm0, xmm0); \ + xmm1 = _mm_packus_epi16(xmm1, xmm1); \ xmm2 = _mm_packus_epi16(xmm2, xmm2); // Store 8 ARGB values. 
@@ -112,7 +109,7 @@ void I422ToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - __m128i xmm0, xmm1, xmm2, xmm4; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; const __m128i xmm5 = _mm_set1_epi8(-1); const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; while (width > 0) { @@ -132,7 +129,7 @@ void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - __m128i xmm0, xmm1, xmm2, xmm4, xmm5; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5; const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; while (width > 0) { READYUVA422 @@ -150,7 +147,7 @@ void I444ToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - __m128i xmm0, xmm1, xmm2, xmm4; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; const __m128i xmm5 = _mm_set1_epi8(-1); const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; while (width > 0) { @@ -170,7 +167,7 @@ void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - __m128i xmm0, xmm1, xmm2, xmm4, xmm5; + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5; const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; while (width > 0) { READYUVA444 @@ -247,11 +244,11 @@ static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, // 7 bit fixed point 0.5. static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; -static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; - -static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, - 0x8080u, 0x8080u, 0x8080u, 0x8080u}; +// 8 bit fixed point 0.5, for bias of UV. 
+static const ulvec8 kBiasUV128 = { + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; // Shuffle table for converting RGB24 to ARGB. static const uvec8 kShuffleMaskRGB24ToARGB = { @@ -1427,7 +1424,7 @@ __declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb, } } -__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1440,7 +1437,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm5, xmmword ptr kBiasUV128 movdqa xmm6, xmmword ptr kARGBToV movdqa xmm7, xmmword ptr kARGBToU sub edi, edx // stride from u to v @@ -1499,7 +1496,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, } } -__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1512,7 +1509,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kAddUVJ128 + movdqa xmm5, xmmword ptr kBiasUV128 movdqa xmm6, xmmword ptr kARGBToVJ movdqa xmm7, xmmword ptr kARGBToUJ sub edi, edx // stride from u to v @@ -1573,7 +1570,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, } #ifdef HAS_ARGBTOUVROW_AVX2 -__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0, +__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1586,7 +1583,7 @@ __declspec(naked) void 
ARGBToUVRow_AVX2(const uint8_t* src_argb0, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width - vbroadcastf128 ymm5, xmmword ptr kAddUV128 + vbroadcastf128 ymm5, xmmword ptr kBiasUV128 vbroadcastf128 ymm6, xmmword ptr kARGBToV vbroadcastf128 ymm7, xmmword ptr kARGBToU sub edi, edx // stride from u to v @@ -1641,7 +1638,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0, #endif // HAS_ARGBTOUVROW_AVX2 #ifdef HAS_ARGBTOUVJROW_AVX2 -__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, +__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1654,7 +1651,7 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width - vbroadcastf128 ymm5, xmmword ptr kAddUVJ128 + vbroadcastf128 ymm5, xmmword ptr kBiasUV128 vbroadcastf128 ymm6, xmmword ptr kARGBToVJ vbroadcastf128 ymm7, xmmword ptr kARGBToUJ sub edi, edx // stride from u to v @@ -1709,7 +1706,7 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, } #endif // HAS_ARGBTOUVJROW_AVX2 -__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { @@ -1719,7 +1716,7 @@ __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0, mov edx, [esp + 4 + 8] // dst_u mov edi, [esp + 4 + 12] // dst_v mov ecx, [esp + 4 + 16] // width - movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm5, xmmword ptr kBiasUV128 movdqa xmm6, xmmword ptr kARGBToV movdqa xmm7, xmmword ptr kARGBToU sub edi, edx // stride from u to v @@ -1767,7 +1764,7 @@ __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0, } } -__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb, 
int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1780,7 +1777,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm5, xmmword ptr kBiasUV128 movdqa xmm6, xmmword ptr kBGRAToV movdqa xmm7, xmmword ptr kBGRAToU sub edi, edx // stride from u to v @@ -1839,7 +1836,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0, } } -__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1852,7 +1849,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm5, xmmword ptr kBiasUV128 movdqa xmm6, xmmword ptr kABGRToV movdqa xmm7, xmmword ptr kABGRToU sub edi, edx // stride from u to v @@ -1911,7 +1908,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0, } } -__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1924,7 +1921,7 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width - movdqa xmm5, xmmword ptr kAddUV128 + movdqa xmm5, xmmword ptr kBiasUV128 movdqa xmm6, xmmword ptr kRGBAToV movdqa xmm7, xmmword ptr kRGBAToU sub edi, edx // stride from u to v @@ -1986,14 +1983,14 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0, // Read 16 UV from 444 #define READYUV444_AVX2 \ - __asm { \ - __asm vmovdqu xmm0, [esi] /* U */ \ - __asm vmovdqu xmm1, [esi + edi] /* V */ \ + __asm { \ + __asm vmovdqu xmm3, [esi] /* U */ \ + 
__asm vmovdqu xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 16] \ - __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpermq ymm3, ymm3, 0xd8 \ __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm lea eax, [eax + 16]} @@ -2001,12 +1998,12 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0, // Read 16 UV from 444. With 16 Alpha. #define READYUVA444_AVX2 \ __asm { \ - __asm vmovdqu xmm0, [esi] /* U */ \ + __asm vmovdqu xmm3, [esi] /* U */ \ __asm vmovdqu xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 16] \ - __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpermq ymm3, ymm3, 0xd8 \ __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ @@ -2017,123 +2014,122 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0, // Read 8 UV from 422, upsample to 16 UV. #define READYUV422_AVX2 \ - __asm { \ - __asm vmovq xmm0, qword ptr [esi] /* U */ \ - __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ + __asm { \ + __asm vmovq xmm3, qword ptr [esi] /* U */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ - __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ + __asm vpermq ymm3, ymm3, 0xd8 \ + __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm lea eax, [eax + 16]} // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. 
#define READYUVA422_AVX2 \ - __asm { \ - __asm vmovq xmm0, qword ptr [esi] /* U */ \ - __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ + __asm { \ + __asm vmovq xmm3, qword ptr [esi] /* U */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ - __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ + __asm vpermq ymm3, ymm3, 0xd8 \ + __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm lea eax, [eax + 16] \ - __asm vmovdqu xmm5, [ebp] /* A */ \ + __asm vmovdqu xmm5, [ebp] /* A */ \ __asm vpermq ymm5, ymm5, 0xd8 \ __asm lea ebp, [ebp + 16]} // Read 8 UV from NV12, upsample to 16 UV. #define READNV12_AVX2 \ - __asm { \ - __asm vmovdqu xmm0, [esi] /* UV */ \ + __asm { \ + __asm vmovdqu xmm3, [esi] /* UV */ \ __asm lea esi, [esi + 16] \ - __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm3, ymm3, 0xd8 \ + __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm lea eax, [eax + 16]} // Read 8 UV from NV21, upsample to 16 UV. 
#define READNV21_AVX2 \ - __asm { \ - __asm vmovdqu xmm0, [esi] /* UV */ \ + __asm { \ + __asm vmovdqu xmm3, [esi] /* UV */ \ __asm lea esi, [esi + 16] \ - __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpermq ymm3, ymm3, 0xd8 \ + __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleNV21 \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm lea eax, [eax + 16]} // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. #define READYUY2_AVX2 \ - __asm { \ - __asm vmovdqu ymm4, [eax] /* YUY2 */ \ + __asm { \ + __asm vmovdqu ymm4, [eax] /* YUY2 */ \ __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \ - __asm vmovdqu ymm0, [eax] /* UV */ \ - __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \ + __asm vmovdqu ymm3, [eax] /* UV */ \ + __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleYUY2UV \ __asm lea eax, [eax + 32]} // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. #define READUYVY_AVX2 \ - __asm { \ - __asm vmovdqu ymm4, [eax] /* UYVY */ \ + __asm { \ + __asm vmovdqu ymm4, [eax] /* UYVY */ \ __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \ - __asm vmovdqu ymm0, [eax] /* UV */ \ - __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \ + __asm vmovdqu ymm3, [eax] /* UV */ \ + __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleUYVYUV \ __asm lea eax, [eax + 32]} // Convert 16 pixels: 16 UV and 16 Y. 
#define YUVTORGB_AVX2(YuvConstants) \ - __asm { \ - __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\ - __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\ - __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\ - __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \ - __asm vpsubw ymm2, ymm3, ymm2 \ - __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \ - __asm vpsubw ymm1, ymm3, ymm1 \ - __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \ - __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \ + __asm { \ + __asm vpsubb ymm3, ymm3, ymmword ptr kBiasUV128 \ __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \ - __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \ - __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \ - __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \ + __asm vmovdqa ymm0, ymmword ptr [YuvConstants + KUVTOB] \ + __asm vmovdqa ymm1, ymmword ptr [YuvConstants + KUVTOG] \ + __asm vmovdqa ymm2, ymmword ptr [YuvConstants + KUVTOR] \ + __asm vpmaddubsw ymm0, ymm0, ymm3 /* B UV */ \ + __asm vpmaddubsw ymm1, ymm1, ymm3 /* G UV */ \ + __asm vpmaddubsw ymm2, ymm2, ymm3 /* B UV */ \ + __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KYBIASTORGB] \ + __asm vpaddw ymm4, ymm3, ymm4 \ + __asm vpaddsw ymm0, ymm0, ymm4 \ + __asm vpsubsw ymm1, ymm4, ymm1 \ + __asm vpaddsw ymm2, ymm2, ymm4 \ __asm vpsraw ymm0, ymm0, 6 \ __asm vpsraw ymm1, ymm1, 6 \ __asm vpsraw ymm2, ymm2, 6 \ - __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \ - __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \ - __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \ - } + __asm vpackuswb ymm0, ymm0, ymm0 \ + __asm vpackuswb ymm1, ymm1, ymm1 \ + __asm vpackuswb ymm2, ymm2, ymm2} // Store 16 ARGB values. 
#define STOREARGB_AVX2 \ - __asm { \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ + __asm { \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ + __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ __asm vpermq ymm2, ymm2, 0xd8 \ - __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ - __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ + __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ + __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ __asm vmovdqu 0[edx], ymm1 \ __asm vmovdqu 32[edx], ymm0 \ __asm lea edx, [edx + 64]} // Store 16 RGBA values. #define STORERGBA_AVX2 \ - __asm { \ - __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ + __asm { \ + __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ + __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ __asm vpermq ymm2, ymm2, 0xd8 \ - __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ - __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ + __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ + __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ __asm vmovdqu [edx], ymm0 \ __asm vmovdqu [edx + 32], ymm1 \ __asm lea edx, [edx + 64]} @@ -2480,11 +2476,11 @@ __declspec(naked) void I422ToRGBARow_AVX2( // Read 8 UV from 444. #define READYUV444 \ - __asm { \ - __asm movq xmm0, qword ptr [esi] /* U */ \ + __asm { \ + __asm movq xmm3, qword ptr [esi] /* U */ \ __asm movq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklbw xmm3, xmm1 /* UV */ \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ __asm lea eax, [eax + 8]} @@ -2492,10 +2488,10 @@ __declspec(naked) void I422ToRGBARow_AVX2( // Read 4 UV from 444. With 8 Alpha. 
#define READYUVA444 \ __asm { \ - __asm movq xmm0, qword ptr [esi] /* U */ \ + __asm movq xmm3, qword ptr [esi] /* U */ \ __asm movq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklbw xmm3, xmm1 /* UV */ \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ __asm lea eax, [eax + 8] \ @@ -2504,180 +2500,178 @@ __declspec(naked) void I422ToRGBARow_AVX2( // Read 4 UV from 422, upsample to 8 UV. #define READYUV422 \ - __asm { \ - __asm movd xmm0, [esi] /* U */ \ - __asm movd xmm1, [esi + edi] /* V */ \ + __asm { \ + __asm movd xmm3, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 4] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm punpcklbw xmm3, xmm1 /* UV */ \ + __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ __asm lea eax, [eax + 8]} // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. #define READYUVA422 \ - __asm { \ - __asm movd xmm0, [esi] /* U */ \ - __asm movd xmm1, [esi + edi] /* V */ \ + __asm { \ + __asm movd xmm3, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 4] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ - __asm movq xmm4, qword ptr [eax] /* Y */ \ + __asm punpcklbw xmm3, xmm1 /* UV */ \ + __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ + __asm movq xmm4, qword ptr [eax] /* Y */ \ __asm punpcklbw xmm4, xmm4 \ __asm lea eax, [eax + 8] \ - __asm movq xmm5, qword ptr [ebp] /* A */ \ + __asm movq xmm5, qword ptr [ebp] /* A */ \ __asm lea ebp, [ebp + 8]} // Read 4 UV from NV12, upsample to 8 UV. 
#define READNV12 \ - __asm { \ - __asm movq xmm0, qword ptr [esi] /* UV */ \ + __asm { \ + __asm movq xmm3, qword ptr [esi] /* UV */ \ __asm lea esi, [esi + 8] \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ __asm lea eax, [eax + 8]} // Read 4 VU from NV21, upsample to 8 UV. #define READNV21 \ - __asm { \ - __asm movq xmm0, qword ptr [esi] /* UV */ \ + __asm { \ + __asm movq xmm3, qword ptr [esi] /* UV */ \ __asm lea esi, [esi + 8] \ - __asm pshufb xmm0, xmmword ptr kShuffleNV21 \ + __asm pshufb xmm3, xmmword ptr kShuffleNV21 \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ __asm lea eax, [eax + 8]} // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV. #define READYUY2 \ - __asm { \ - __asm movdqu xmm4, [eax] /* YUY2 */ \ + __asm { \ + __asm movdqu xmm4, [eax] /* YUY2 */ \ __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \ - __asm movdqu xmm0, [eax] /* UV */ \ - __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \ + __asm movdqu xmm3, [eax] /* UV */ \ + __asm pshufb xmm3, xmmword ptr kShuffleYUY2UV \ __asm lea eax, [eax + 16]} // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV. #define READUYVY \ - __asm { \ - __asm movdqu xmm4, [eax] /* UYVY */ \ + __asm { \ + __asm movdqu xmm4, [eax] /* UYVY */ \ __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \ - __asm movdqu xmm0, [eax] /* UV */ \ - __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \ + __asm movdqu xmm3, [eax] /* UV */ \ + __asm pshufb xmm3, xmmword ptr kShuffleUYVYUV \ __asm lea eax, [eax + 16]} // Convert 8 pixels: 8 UV and 8 Y. 
#define YUVTORGB(YuvConstants) \ - __asm { \ - __asm movdqa xmm1, xmm0 \ - __asm movdqa xmm2, xmm0 \ - __asm movdqa xmm3, xmm0 \ - __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \ - __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \ - __asm psubw xmm0, xmm1 \ - __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \ - __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \ - __asm psubw xmm1, xmm2 \ - __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \ - __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \ - __asm psubw xmm2, xmm3 \ + __asm { \ + __asm psubb xmm3, xmmword ptr kBiasUV128 \ __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \ - __asm paddsw xmm0, xmm4 /* B += Y */ \ - __asm paddsw xmm1, xmm4 /* G += Y */ \ - __asm paddsw xmm2, xmm4 /* R += Y */ \ + __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVTOB] \ + __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVTOG] \ + __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVTOR] \ + __asm pmaddubsw xmm0, xmm3 \ + __asm pmaddubsw xmm1, xmm3 \ + __asm pmaddubsw xmm2, xmm3 \ + __asm movdqa xmm3, xmmword ptr [YuvConstants + KYBIASTORGB] \ + __asm paddw xmm4, xmm3 \ + __asm paddsw xmm0, xmm4 \ + __asm paddsw xmm2, xmm4 \ + __asm psubsw xmm4, xmm1 \ + __asm movdqa xmm1, xmm4 \ __asm psraw xmm0, 6 \ __asm psraw xmm1, 6 \ __asm psraw xmm2, 6 \ - __asm packuswb xmm0, xmm0 /* B */ \ - __asm packuswb xmm1, xmm1 /* G */ \ - __asm packuswb xmm2, xmm2 /* R */ \ + __asm packuswb xmm0, xmm0 /* B */ \ + __asm packuswb xmm1, xmm1 /* G */ \ + __asm packuswb xmm2, xmm2 /* R */ \ } // Store 8 ARGB values. 
#define STOREARGB \ - __asm { \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm5 /* RA */ \ + __asm { \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm5 /* RA */ \ __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ + __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ __asm movdqu 0[edx], xmm0 \ __asm movdqu 16[edx], xmm1 \ __asm lea edx, [edx + 32]} // Store 8 BGRA values. #define STOREBGRA \ - __asm { \ - __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ - __asm punpcklbw xmm1, xmm0 /* GB */ \ - __asm punpcklbw xmm5, xmm2 /* AR */ \ + __asm { \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm0 /* GB */ \ + __asm punpcklbw xmm5, xmm2 /* AR */ \ __asm movdqa xmm0, xmm5 \ - __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ - __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ + __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ __asm movdqu 0[edx], xmm5 \ __asm movdqu 16[edx], xmm0 \ __asm lea edx, [edx + 32]} // Store 8 RGBA values. #define STORERGBA \ - __asm { \ - __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ - __asm punpcklbw xmm1, xmm2 /* GR */ \ - __asm punpcklbw xmm5, xmm0 /* AB */ \ + __asm { \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm2 /* GR */ \ + __asm punpcklbw xmm5, xmm0 /* AB */ \ __asm movdqa xmm0, xmm5 \ - __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ - __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ + __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ __asm movdqu 0[edx], xmm5 \ __asm movdqu 16[edx], xmm0 \ __asm lea edx, [edx + 32]} // Store 8 RGB24 values. 
#define STORERGB24 \ - __asm {/* Weave into RRGB */ \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm2 /* RR */ \ + __asm {/* Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \ - __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ - __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ - __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ - __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ - __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \ + __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ + __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ + __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ + __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ + __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ __asm lea edx, [edx + 24]} // Store 8 RGB565 values. 
#define STORERGB565 \ - __asm {/* Weave into RRGB */ \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm2 /* RR */ \ + __asm {/* Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \ - __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ - __asm movdqa xmm2, xmm0 /* G */ \ - __asm pslld xmm0, 8 /* R */ \ - __asm psrld xmm3, 3 /* B */ \ - __asm psrld xmm2, 5 /* G */ \ - __asm psrad xmm0, 16 /* R */ \ - __asm pand xmm3, xmm5 /* B */ \ - __asm pand xmm2, xmm6 /* G */ \ - __asm pand xmm0, xmm7 /* R */ \ - __asm por xmm3, xmm2 /* BG */ \ - __asm por xmm0, xmm3 /* BGR */ \ - __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ - __asm movdqa xmm2, xmm1 /* G */ \ - __asm pslld xmm1, 8 /* R */ \ - __asm psrld xmm3, 3 /* B */ \ - __asm psrld xmm2, 5 /* G */ \ - __asm psrad xmm1, 16 /* R */ \ - __asm pand xmm3, xmm5 /* B */ \ - __asm pand xmm2, xmm6 /* G */ \ - __asm pand xmm1, xmm7 /* R */ \ - __asm por xmm3, xmm2 /* BG */ \ - __asm por xmm1, xmm3 /* BGR */ \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \ + __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ + __asm movdqa xmm2, xmm0 /* G */ \ + __asm pslld xmm0, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm0, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand xmm2, xmm6 /* G */ \ + __asm pand xmm0, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm0, xmm3 /* BGR */ \ + __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ + __asm movdqa xmm2, xmm1 /* G */ \ + __asm pslld xmm1, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm1, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand 
xmm2, xmm6 /* G */ \ + __asm pand xmm1, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm1, xmm3 /* BGR */ \ __asm packssdw xmm0, xmm1 \ - __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ + __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ __asm lea edx, [edx + 16]} // 8 pixels. @@ -4347,13 +4341,13 @@ static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; // Blend 8 pixels at a time. -__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 + mov eax, [esp + 4 + 4] // src_argb mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width @@ -4442,7 +4436,7 @@ __declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { __asm { - mov eax, [esp + 4] // src_argb0 + mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width pcmpeqb xmm3, xmm3 // generate mask 0xff000000 @@ -4487,7 +4481,7 @@ __declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { __asm { - mov eax, [esp + 4] // src_argb0 + mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax @@ -4581,7 +4575,7 @@ __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { __asm { - mov eax, [esp + 4] // src_argb0 + mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax @@ -4752,22 +4746,22 @@ __declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { pmaddubsw xmm6, xmm2 phaddw xmm0, xmm6 psrlw xmm0, 7 - packuswb xmm0, xmm0 // 8 B values + packuswb xmm0, xmm0 // 8 B values movdqu xmm5, [eax] // G movdqu xmm1, [eax + 
16] pmaddubsw xmm5, xmm3 pmaddubsw xmm1, xmm3 phaddw xmm5, xmm1 psrlw xmm5, 7 - packuswb xmm5, xmm5 // 8 G values - punpcklbw xmm0, xmm5 // 8 BG values + packuswb xmm5, xmm5 // 8 G values + punpcklbw xmm0, xmm5 // 8 BG values movdqu xmm5, [eax] // R movdqu xmm1, [eax + 16] pmaddubsw xmm5, xmm4 pmaddubsw xmm1, xmm4 phaddw xmm5, xmm1 psrlw xmm5, 7 - packuswb xmm5, xmm5 // 8 R values + packuswb xmm5, xmm5 // 8 R values movdqu xmm6, [eax] // A movdqu xmm1, [eax + 16] psrld xmm6, 24 @@ -4817,25 +4811,25 @@ __declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, movdqu xmm1, [eax + 16] pmaddubsw xmm6, xmm3 pmaddubsw xmm1, xmm3 - phaddsw xmm0, xmm7 // B - phaddsw xmm6, xmm1 // G - psraw xmm0, 6 // B - psraw xmm6, 6 // G - packuswb xmm0, xmm0 // 8 B values - packuswb xmm6, xmm6 // 8 G values - punpcklbw xmm0, xmm6 // 8 BG values + phaddsw xmm0, xmm7 // B + phaddsw xmm6, xmm1 // G + psraw xmm0, 6 // B + psraw xmm6, 6 // G + packuswb xmm0, xmm0 // 8 B values + packuswb xmm6, xmm6 // 8 G values + punpcklbw xmm0, xmm6 // 8 BG values movdqu xmm1, [eax] // R movdqu xmm7, [eax + 16] pmaddubsw xmm1, xmm4 pmaddubsw xmm7, xmm4 - phaddsw xmm1, xmm7 // R + phaddsw xmm1, xmm7 // R movdqu xmm6, [eax] // A movdqu xmm7, [eax + 16] pmaddubsw xmm6, xmm5 pmaddubsw xmm7, xmm5 phaddsw xmm6, xmm7 // A - psraw xmm1, 6 // R - psraw xmm6, 6 // A + psraw xmm1, 6 // R + psraw xmm6, 6 // A packuswb xmm1, xmm1 // 8 R values packuswb xmm6, xmm6 // 8 A values punpcklbw xmm1, xmm6 // 8 RA values @@ -4878,16 +4872,16 @@ __declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, convertloop: movdqu xmm0, [eax] // read 4 pixels - punpcklbw xmm0, xmm5 // first 2 pixels - pmulhuw xmm0, xmm2 // pixel * scale >> 16 + punpcklbw xmm0, xmm5 // first 2 pixels + pmulhuw xmm0, xmm2 // pixel * scale >> 16 movdqu xmm1, [eax] // read 4 pixels - punpckhbw xmm1, xmm5 // next 2 pixels + punpckhbw xmm1, xmm5 // next 2 pixels pmulhuw xmm1, xmm2 - pmullw xmm0, xmm3 // * interval_size + pmullw xmm0, xmm3 
// * interval_size movdqu xmm7, [eax] // read 4 pixels pmullw xmm1, xmm3 - pand xmm7, xmm6 // mask alpha - paddw xmm0, xmm4 // + interval_size / 2 + pand xmm7, xmm6 // mask alpha + paddw xmm0, xmm4 // + interval_size / 2 paddw xmm1, xmm4 packuswb xmm0, xmm1 por xmm0, xmm7 @@ -4907,9 +4901,9 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb, int width, uint32_t value) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width movd xmm2, [esp + 16] // value punpcklbw xmm2, xmm2 punpcklqdq xmm2, xmm2 @@ -4918,10 +4912,10 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb, movdqu xmm0, [eax] // read 4 pixels lea eax, [eax + 16] movdqa xmm1, xmm0 - punpcklbw xmm0, xmm0 // first 2 - punpckhbw xmm1, xmm1 // next 2 - pmulhuw xmm0, xmm2 // argb * value - pmulhuw xmm1, xmm2 // argb * value + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + pmulhuw xmm0, xmm2 // argb * value + pmulhuw xmm1, xmm2 // argb * value psrlw xmm0, 8 psrlw xmm1, 8 packuswb xmm0, xmm1 @@ -4937,29 +4931,29 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb, #ifdef HAS_ARGBMULTIPLYROW_SSE2 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. 
-__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, +__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - pxor xmm5, xmm5 // constant 0 + pxor xmm5, xmm5 // constant 0 convertloop: - movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm0, [eax] // read 4 pixels from src_argb movdqu xmm2, [esi] // read 4 pixels from src_argb1 movdqu xmm1, xmm0 movdqu xmm3, xmm2 - punpcklbw xmm0, xmm0 // first 2 - punpckhbw xmm1, xmm1 // next 2 - punpcklbw xmm2, xmm5 // first 2 - punpckhbw xmm3, xmm5 // next 2 - pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 - pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + punpcklbw xmm2, xmm5 // first 2 + punpckhbw xmm3, xmm5 // next 2 + pmulhuw xmm0, xmm2 // src_argb * src_argb1 first 2 + pmulhuw xmm1, xmm3 // src_argb * src_argb1 next 2 lea eax, [eax + 16] lea esi, [esi + 16] packuswb xmm0, xmm1 @@ -4977,14 +4971,14 @@ __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, #ifdef HAS_ARGBADDROW_SSE2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. // TODO(fbarchard): Port this to posix, neon and other math functions. 
-__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0, +__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width @@ -4992,11 +4986,11 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0, jl convertloop49 convertloop4: - movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm0, [eax] // read 4 pixels from src_argb lea eax, [eax + 16] movdqu xmm1, [esi] // read 4 pixels from src_argb1 lea esi, [esi + 16] - paddusb xmm0, xmm1 // src_argb0 + src_argb1 + paddusb xmm0, xmm1 // src_argb + src_argb1 movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -5007,11 +5001,11 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0, jl convertloop19 convertloop1: - movd xmm0, [eax] // read 1 pixels from src_argb0 + movd xmm0, [eax] // read 1 pixels from src_argb lea eax, [eax + 4] movd xmm1, [esi] // read 1 pixels from src_argb1 lea esi, [esi + 4] - paddusb xmm0, xmm1 // src_argb0 + src_argb1 + paddusb xmm0, xmm1 // src_argb + src_argb1 movd [edx], xmm0 lea edx, [edx + 4] sub ecx, 1 @@ -5026,23 +5020,23 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0, #ifdef HAS_ARGBSUBTRACTROW_SSE2 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time. 
-__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, +__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: - movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm0, [eax] // read 4 pixels from src_argb lea eax, [eax + 16] movdqu xmm1, [esi] // read 4 pixels from src_argb1 lea esi, [esi + 16] - psubusb xmm0, xmm1 // src_argb0 - src_argb1 + psubusb xmm0, xmm1 // src_argb - src_argb1 movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -5056,20 +5050,20 @@ __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, #ifdef HAS_ARGBMULTIPLYROW_AVX2 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, +__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - vpxor ymm5, ymm5, ymm5 // constant 0 + vpxor ymm5, ymm5, ymm5 // constant 0 convertloop: - vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 + vmovdqu ymm1, [eax] // read 8 pixels from src_argb lea eax, [eax + 32] vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 lea esi, [esi + 32] @@ -5077,8 +5071,8 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, vpunpckhbw ymm1, ymm1, ymm1 // high 4 vpunpcklbw ymm2, ymm3, ymm5 // low 4 vpunpckhbw ymm3, ymm3, ymm5 // high 4 - vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 - vpmulhuw ymm1, ymm1, ymm3 // 
src_argb0 * src_argb1 high 4 + vpmulhuw ymm0, ymm0, ymm2 // src_argb * src_argb1 low 4 + vpmulhuw ymm1, ymm1, ymm3 // src_argb * src_argb1 high 4 vpackuswb ymm0, ymm0, ymm1 vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -5094,19 +5088,19 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, #ifdef HAS_ARGBADDROW_AVX2 // Add 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0, +__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: - vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + vmovdqu ymm0, [eax] // read 8 pixels from src_argb lea eax, [eax + 32] vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 lea esi, [esi + 32] @@ -5124,21 +5118,21 @@ __declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0, #ifdef HAS_ARGBSUBTRACTROW_AVX2 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time. 
-__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, +__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: - vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + vmovdqu ymm0, [eax] // read 8 pixels from src_argb lea eax, [eax + 32] - vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 + vpsubusb ymm0, ymm0, [esi] // src_argb - src_argb1 lea esi, [esi + 32] vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -5165,8 +5159,8 @@ __declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0, __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_y0 - mov esi, [esp + 8 + 8] // src_y1 + mov eax, [esp + 8 + 4] // src_y0 + mov esi, [esp + 8 + 8] // src_y1 mov edi, [esp + 8 + 12] // src_y2 mov edx, [esp + 8 + 16] // dst_sobelx mov ecx, [esp + 8 + 20] // width @@ -5176,17 +5170,17 @@ __declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0, pxor xmm5, xmm5 // constant 0 convertloop: - movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] punpcklbw xmm0, xmm5 punpcklbw xmm1, xmm5 psubw xmm0, xmm1 - movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] punpcklbw xmm1, xmm5 punpcklbw xmm2, xmm5 psubw xmm1, xmm2 - movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] + movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] punpcklbw xmm2, xmm5 punpcklbw xmm3, xmm5 @@ -5221,8 +5215,8 @@ 
__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_y0 - mov esi, [esp + 4 + 8] // src_y1 + mov eax, [esp + 4 + 4] // src_y0 + mov esi, [esp + 4 + 8] // src_y1 mov edx, [esp + 4 + 12] // dst_sobely mov ecx, [esp + 4 + 16] // width sub esi, eax @@ -5230,17 +5224,17 @@ __declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0, pxor xmm5, xmm5 // constant 0 convertloop: - movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] punpcklbw xmm0, xmm5 punpcklbw xmm1, xmm5 psubw xmm0, xmm1 - movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] + movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] punpcklbw xmm1, xmm5 punpcklbw xmm2, xmm5 psubw xmm1, xmm2 - movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] punpcklbw xmm2, xmm5 punpcklbw xmm3, xmm5 @@ -5275,8 +5269,8 @@ __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax @@ -5284,7 +5278,7 @@ __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx, pslld xmm5, 24 // 0xff000000 convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm0, [eax] // read 16 pixels src_sobelx movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] paddusb xmm0, xmm1 // sobel = sobelx + sobely @@ -5323,8 +5317,8 @@ __declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, int width) { 
__asm { push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax @@ -5357,15 +5351,15 @@ __declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax pcmpeqb xmm5, xmm5 // alpha 255 convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm0, [eax] // read 16 pixels src_sobelx movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] movdqa xmm2, xmm0 @@ -5535,7 +5529,7 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: movdqu xmm0, [eax] psubd xmm0, [eax + edx * 4] @@ -5577,7 +5571,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row, test edx, 15 jne l4b - // 4 pixel loop + // 4 pixel loop l4: movdqu xmm2, [eax] // 4 argb pixels 16 bytes. 
lea eax, [eax + 16] @@ -5623,7 +5617,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: movd xmm2, dword ptr [eax] // 1 argb pixel lea eax, [eax + 4] @@ -5657,7 +5651,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, mov esi, [esp + 16] // stride mov edx, [esp + 20] // dst_argb mov ecx, [esp + 24] // pointer to uv_dudv - movq xmm2, qword ptr [ecx] // uv + movq xmm2, qword ptr [ecx] // uv movq xmm7, qword ptr [ecx + 8] // dudv mov ecx, [esp + 28] // width shl esi, 16 // 4, stride @@ -5666,7 +5660,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, sub ecx, 4 jl l4b - // setup for 4 pixel loop + // setup for 4 pixel loop pshufd xmm7, xmm7, 0x44 // dup dudv pshufd xmm5, xmm5, 0 // dup 4, stride movdqa xmm0, xmm2 // x0, y0, x1, y1 @@ -5678,16 +5672,16 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, addps xmm3, xmm4 addps xmm4, xmm4 // dudv *= 4 - // 4 pixel loop + // 4 pixel loop l4: cvttps2dq xmm0, xmm2 // x, y float to int first 2 cvttps2dq xmm1, xmm3 // x, y float to int next 2 packssdw xmm0, xmm1 // x, y as 8 shorts pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. 
movd esi, xmm0 - pshufd xmm0, xmm0, 0x39 // shift right + pshufd xmm0, xmm0, 0x39 // shift right movd edi, xmm0 - pshufd xmm0, xmm0, 0x39 // shift right + pshufd xmm0, xmm0, 0x39 // shift right movd xmm1, [eax + esi] // read pixel 0 movd xmm6, [eax + edi] // read pixel 1 punpckldq xmm1, xmm6 // combine pixel 0 and 1 @@ -5739,8 +5733,8 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr, __asm { push esi push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) @@ -5749,7 +5743,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr, je xloop100 // 0 / 256. Blend 100 / 0. sub edi, esi cmp eax, 128 - je xloop50 // 128 /256 is 0.50. Blend 50 / 50. + je xloop50 // 128 /256 is 0.50. Blend 50 / 50. vmovd xmm0, eax // high fraction 0..255 neg eax @@ -5776,7 +5770,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr, vpaddw ymm0, ymm0, ymm4 vpsrlw ymm1, ymm1, 8 vpsrlw ymm0, ymm0, 8 - vpackuswb ymm0, ymm0, ymm1 // unmutates + vpackuswb ymm0, ymm0, ymm1 // unmutates vmovdqu [esi + edi], ymm0 lea esi, [esi + 32] sub ecx, 32 @@ -5817,17 +5811,17 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr, push esi push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) sub edi, esi - // Dispatch to specialized filters if applicable. + // Dispatch to specialized filters if applicable. cmp eax, 0 je xloop100 // 0 /256. Blend 100 / 0. cmp eax, 128 - je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. + je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. 
movd xmm0, eax // high fraction 0..255 neg eax @@ -5846,7 +5840,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr, movdqu xmm1, xmm0 punpcklbw xmm0, xmm2 punpckhbw xmm1, xmm2 - psubb xmm0, xmm4 // bias image by -128 + psubb xmm0, xmm4 // bias image by -128 psubb xmm1, xmm4 movdqa xmm2, xmm5 movdqa xmm3, xmm5 @@ -5895,8 +5889,8 @@ __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, const uint8_t* shuffler, int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // shuffler movdqu xmm5, [ecx] mov ecx, [esp + 16] // width @@ -5922,8 +5916,8 @@ __declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb, const uint8_t* shuffler, int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // shuffler vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. 
mov ecx, [esp + 16] // width @@ -5960,18 +5954,18 @@ __declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y, __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_y - mov esi, [esp + 8 + 8] // src_u + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u mov edx, [esp + 8 + 12] // src_v mov edi, [esp + 8 + 16] // dst_frame mov ecx, [esp + 8 + 20] // width sub edx, esi convertloop: - movq xmm2, qword ptr [esi] // U + movq xmm2, qword ptr [esi] // U movq xmm3, qword ptr [esi + edx] // V lea esi, [esi + 8] - punpcklbw xmm2, xmm3 // UV + punpcklbw xmm2, xmm3 // UV movdqu xmm0, [eax] // Y lea eax, [eax + 16] movdqa xmm1, xmm0 @@ -5997,22 +5991,22 @@ __declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y, __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_y - mov esi, [esp + 8 + 8] // src_u + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u mov edx, [esp + 8 + 12] // src_v mov edi, [esp + 8 + 16] // dst_frame mov ecx, [esp + 8 + 20] // width sub edx, esi convertloop: - movq xmm2, qword ptr [esi] // U + movq xmm2, qword ptr [esi] // U movq xmm3, qword ptr [esi + edx] // V lea esi, [esi + 8] - punpcklbw xmm2, xmm3 // UV + punpcklbw xmm2, xmm3 // UV movdqu xmm0, [eax] // Y movdqa xmm1, xmm2 lea eax, [eax + 16] - punpcklbw xmm1, xmm0 // UYVY + punpcklbw xmm1, xmm0 // UYVY punpckhbw xmm2, xmm0 movdqu [edi], xmm1 movdqu [edi + 16], xmm2 @@ -6039,10 +6033,10 @@ __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, mov ecx, [esp + 4 + 16] /* width */ pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. - // 2 pixel loop. + // 2 pixel loop. 
convertloop: - // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel - // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel + // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel + // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel movq xmm0, qword ptr [eax] // BGRABGRA lea eax, [eax + 8] punpcklbw xmm0, xmm3 @@ -6091,8 +6085,8 @@ __declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, const float* poly, int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ mov ecx, [esp + 12] /* poly */ vbroadcastf128 ymm4, [ecx] // C0 vbroadcastf128 ymm5, [ecx + 16] // C1 @@ -6131,8 +6125,8 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src, float scale, int width) { __asm { - mov eax, [esp + 4] /* src */ - mov edx, [esp + 8] /* dst */ + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ movd xmm4, dword ptr [esp + 12] /* scale */ mov ecx, [esp + 16] /* width */ mulss xmm4, kExpBias @@ -6140,7 +6134,7 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src, pxor xmm5, xmm5 sub edx, eax - // 8 pixel loop. + // 8 pixel loop. convertloop: movdqu xmm2, xmmword ptr [eax] // 8 shorts add eax, 16 @@ -6178,7 +6172,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src, vpxor ymm5, ymm5, ymm5 sub edx, eax - // 16 pixel loop. + // 16 pixel loop. convertloop: vmovdqu ymm2, [eax] // 16 shorts add eax, 32 @@ -6188,7 +6182,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src, vcvtdq2ps ymm2, ymm2 vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range. 
vmulps ymm2, ymm2, ymm4 - vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate + vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate vpsrld ymm2, ymm2, 13 vpackssdw ymm2, ymm2, ymm3 vmovdqu [eax + edx - 32], ymm2 @@ -6206,22 +6200,22 @@ __declspec(naked) void HalfFloatRow_F16C(const uint16_t* src, float scale, int width) { __asm { - mov eax, [esp + 4] /* src */ - mov edx, [esp + 8] /* dst */ + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ vbroadcastss ymm4, [esp + 12] /* scale */ - mov ecx, [esp + 16] /* width */ + mov ecx, [esp + 16] /* width */ sub edx, eax - // 16 pixel loop. + // 16 pixel loop. convertloop: vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts add eax, 32 - vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats + vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats vcvtdq2ps ymm3, ymm3 vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1 vmulps ymm3, ymm3, ymm4 - vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate + vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate vcvtps2ph xmm3, ymm3, 3 vmovdqu [eax + edx + 32], xmm2 vmovdqu [eax + edx + 32 + 16], xmm3 @@ -6240,8 +6234,8 @@ __declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] /* dst_argb */ - mov esi, [esp + 4 + 8] /* table_argb */ + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ mov ecx, [esp + 4 + 12] /* width */ // 1 pixel loop. @@ -6274,8 +6268,8 @@ __declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] /* dst_argb */ - mov esi, [esp + 4 + 8] /* table_argb */ + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ mov ecx, [esp + 4 + 12] /* width */ // 1 pixel loop. 
@@ -6309,8 +6303,8 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, __asm { push esi push edi - mov eax, [esp + 8 + 4] /* src_argb */ - mov edi, [esp + 8 + 8] /* dst_argb */ + mov eax, [esp + 8 + 4] /* src_argb */ + mov edi, [esp + 8 + 8] /* dst_argb */ mov ecx, [esp + 8 + 12] /* width */ movd xmm2, dword ptr [esp + 8 + 16] // luma table movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff @@ -6320,7 +6314,7 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, psllw xmm4, 8 pxor xmm5, xmm5 - // 4 pixel loop. + // 4 pixel loop. convertloop: movdqu xmm0, xmmword ptr [eax] // generate luma ptr pmaddubsw xmm0, xmm3 diff --git a/third_party/libyuv/source/scale.cc b/third_party/libyuv/source/scale.cc index 4a5dc94aaa..03b0486f76 100644 --- a/third_party/libyuv/source/scale.cc +++ b/third_party/libyuv/source/scale.cc @@ -1446,7 +1446,8 @@ void ScalePlaneUp2_Bilinear(int src_width, for (x = 0; x < src_height - 1; ++x) { Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); src_ptr += src_stride; - // TODO: Test performance of writing one row of destination at a time. + // TODO(fbarchard): Test performance of writing one row of destination at a + // time. dst_ptr += 2 * dst_stride; } if (!(dst_height & 1)) { @@ -1459,7 +1460,7 @@ void ScalePlaneUp2_Bilinear(int src_width, // its original width, using linear interpolation. // stride is in count of uint16_t. // This is used to scale U and V planes of I210 to I410 and I212 to I412. -void ScalePlaneUp2_16_Linear(int src_width, +void ScalePlaneUp2_12_Linear(int src_width, int src_height, int dst_width, int dst_height, @@ -1476,21 +1477,21 @@ void ScalePlaneUp2_16_Linear(int src_width, // This function can only scale up by 2 times horizontally. 
assert(src_width == ((dst_width + 1) / 2)); -#ifdef HAS_SCALEROWUP2LINEAR_16_SSSE3 +#ifdef HAS_SCALEROWUP2LINEAR_12_SSSE3 if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSSE3; + ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3; } #endif -#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 +#ifdef HAS_SCALEROWUP2LINEAR_12_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowUp = ScaleRowUp2_Linear_16_Any_AVX2; + ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2; } #endif -#ifdef HAS_SCALEROWUP2LINEAR_16_NEON +#ifdef HAS_SCALEROWUP2LINEAR_12_NEON if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowUp = ScaleRowUp2_Linear_16_Any_NEON; + ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON; } #endif @@ -1513,6 +1514,102 @@ void ScalePlaneUp2_16_Linear(int src_width, // its original size, using bilinear interpolation. // stride is in count of uint16_t. // This is used to scale U and V planes of I010 to I410 and I012 to I412. +void ScalePlaneUp2_12_Bilinear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleRowUp2_Bilinear_16_Any_C; + int x; + + // This function can only scale up by 2 times. 
+ assert(src_width == ((dst_width + 1) / 2)); + assert(src_height == ((dst_height + 1) / 2)); + +#ifdef HAS_SCALEROWUP2BILINEAR_12_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEROWUP2BILINEAR_12_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2BILINEAR_12_NEON + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON; + } +#endif + + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + } +} + +void ScalePlaneUp2_16_Linear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + int i; + int y; + int dy; + + // This function can only scale up by 2 times horizontally. 
+ assert(src_width == ((dst_width + 1) / 2)); + +#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2; + } +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleRowUp2_Linear_16_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_16_NEON + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleRowUp2_Linear_16_Any_NEON; + } +#endif + + if (dst_height == 1) { + ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr, + dst_width); + } else { + dy = FixedDiv(src_height - 1, dst_height - 1); + y = (1 << 15) - 1; + for (i = 0; i < dst_height; ++i) { + ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width); + dst_ptr += dst_stride; + y += dy; + } + } +} + void ScalePlaneUp2_16_Bilinear(int src_width, int src_height, int dst_width, @@ -1530,7 +1627,7 @@ void ScalePlaneUp2_16_Bilinear(int src_width, assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); -#ifdef HAS_SCALEROWUP2BILINEAR_16_SSSE3 +#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2 if (TestCpuFlag(kCpuHasSSSE3)) { Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSSE3; } @@ -1945,6 +2042,17 @@ void ScalePlane_16(const uint16_t* src, dst_stride, src, dst); return; } + if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { + ScalePlaneUp2_16_Linear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && + (filtering == kFilterBilinear || filtering == kFilterBox)) { + ScalePlaneUp2_16_Bilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } if (filtering && dst_height > src_height) { ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); @@ -1981,13 +2089,13 @@ void ScalePlane_12(const uint16_t* src, } if ((dst_width + 1) / 2 
== src_width && filtering == kFilterLinear) { - ScalePlaneUp2_16_Linear(src_width, src_height, dst_width, dst_height, + ScalePlaneUp2_12_Linear(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst); return; } if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && (filtering == kFilterBilinear || filtering == kFilterBox)) { - ScalePlaneUp2_16_Bilinear(src_width, src_height, dst_width, dst_height, + ScalePlaneUp2_12_Bilinear(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst); return; } diff --git a/third_party/libyuv/source/scale_any.cc b/third_party/libyuv/source/scale_any.cc index d30f583366..965749c415 100644 --- a/third_party/libyuv/source/scale_any.cc +++ b/third_party/libyuv/source/scale_any.cc @@ -656,14 +656,22 @@ SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3, uint8_t) #endif -#ifdef HAS_SCALEROWUP2LINEAR_16_SSSE3 -SUH2LANY(ScaleRowUp2_Linear_16_Any_SSSE3, - ScaleRowUp2_Linear_16_SSSE3, +#ifdef HAS_SCALEROWUP2LINEAR_12_SSSE3 +SUH2LANY(ScaleRowUp2_Linear_12_Any_SSSE3, + ScaleRowUp2_Linear_12_SSSE3, ScaleRowUp2_Linear_16_C, 15, uint16_t) #endif +#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 +SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2, + ScaleRowUp2_Linear_16_SSE2, + ScaleRowUp2_Linear_16_C, + 7, + uint16_t) +#endif + #ifdef HAS_SCALEROWUP2LINEAR_AVX2 SUH2LANY(ScaleRowUp2_Linear_Any_AVX2, ScaleRowUp2_Linear_AVX2, @@ -672,11 +680,19 @@ SUH2LANY(ScaleRowUp2_Linear_Any_AVX2, uint8_t) #endif +#ifdef HAS_SCALEROWUP2LINEAR_12_AVX2 +SUH2LANY(ScaleRowUp2_Linear_12_Any_AVX2, + ScaleRowUp2_Linear_12_AVX2, + ScaleRowUp2_Linear_16_C, + 31, + uint16_t) +#endif + #ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2, ScaleRowUp2_Linear_16_AVX2, ScaleRowUp2_Linear_16_C, - 31, + 15, uint16_t) #endif @@ -688,6 +704,14 @@ SUH2LANY(ScaleRowUp2_Linear_Any_NEON, uint8_t) #endif +#ifdef HAS_SCALEROWUP2LINEAR_12_NEON +SUH2LANY(ScaleRowUp2_Linear_12_Any_NEON, + ScaleRowUp2_Linear_12_NEON, + 
ScaleRowUp2_Linear_16_C, + 15, + uint16_t) +#endif + #ifdef HAS_SCALEROWUP2LINEAR_16_NEON SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON, ScaleRowUp2_Linear_16_NEON, @@ -744,14 +768,22 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2, uint8_t) #endif -#ifdef HAS_SCALEROWUP2BILINEAR_16_SSSE3 -SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSSE3, - ScaleRowUp2_Bilinear_16_SSSE3, +#ifdef HAS_SCALEROWUP2BILINEAR_12_SSSE3 +SU2BLANY(ScaleRowUp2_Bilinear_12_Any_SSSE3, + ScaleRowUp2_Bilinear_12_SSSE3, ScaleRowUp2_Bilinear_16_C, 15, uint16_t) #endif +#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2 +SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSSE3, + ScaleRowUp2_Bilinear_16_SSE2, + ScaleRowUp2_Bilinear_16_C, + 7, + uint16_t) +#endif + #ifdef HAS_SCALEROWUP2BILINEAR_SSSE3 SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3, ScaleRowUp2_Bilinear_SSSE3, @@ -768,6 +800,14 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2, uint8_t) #endif +#ifdef HAS_SCALEROWUP2BILINEAR_12_AVX2 +SU2BLANY(ScaleRowUp2_Bilinear_12_Any_AVX2, + ScaleRowUp2_Bilinear_12_AVX2, + ScaleRowUp2_Bilinear_16_C, + 15, + uint16_t) +#endif + #ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2 SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2, ScaleRowUp2_Bilinear_16_AVX2, @@ -784,11 +824,19 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON, uint8_t) #endif +#ifdef HAS_SCALEROWUP2BILINEAR_12_NEON +SU2BLANY(ScaleRowUp2_Bilinear_12_Any_NEON, + ScaleRowUp2_Bilinear_12_NEON, + ScaleRowUp2_Bilinear_16_C, + 15, + uint16_t) +#endif + #ifdef HAS_SCALEROWUP2BILINEAR_16_NEON SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON, ScaleRowUp2_Bilinear_16_NEON, ScaleRowUp2_Bilinear_16_C, - 15, + 7, uint16_t) #endif @@ -860,7 +908,7 @@ SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_AVX2, SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON, ScaleUVRowUp2_Linear_NEON, ScaleUVRowUp2_Linear_C, - 7, + 15, uint8_t) #endif @@ -868,7 +916,7 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON, SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_NEON, ScaleUVRowUp2_Linear_16_NEON, ScaleUVRowUp2_Linear_16_C, - 7, + 15, uint16_t) #endif @@ -966,7 +1014,7 @@ 
SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON, SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_NEON, ScaleUVRowUp2_Bilinear_16_NEON, ScaleUVRowUp2_Bilinear_16_C, - 3, + 7, uint16_t) #endif diff --git a/third_party/libyuv/source/scale_gcc.cc b/third_party/libyuv/source/scale_gcc.cc index f03903f0be..279c5e4020 100644 --- a/third_party/libyuv/source/scale_gcc.cc +++ b/third_party/libyuv/source/scale_gcc.cc @@ -17,8 +17,7 @@ extern "C" { #endif // This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) // Offsets for source bytes 0 to 9 static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, @@ -950,8 +949,8 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, } #endif -#ifdef HAS_SCALEROWUP2LINEAR_16_SSSE3 -void ScaleRowUp2_Linear_16_SSSE3(const uint16_t* src_ptr, +#ifdef HAS_SCALEROWUP2LINEAR_12_SSSE3 +void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { asm volatile( @@ -1000,8 +999,8 @@ void ScaleRowUp2_Linear_16_SSSE3(const uint16_t* src_ptr, } #endif -#ifdef HAS_SCALEROWUP2BILINEAR_16_SSSE3 -void ScaleRowUp2_Bilinear_16_SSSE3(const uint16_t* src_ptr, +#ifdef HAS_SCALEROWUP2BILINEAR_12_SSSE3 +void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, @@ -1045,11 +1044,11 @@ void ScaleRowUp2_Bilinear_16_SSSE3(const uint16_t* src_ptr, "paddw %%xmm3,%%xmm5 \n" // near+far "paddw %%xmm1,%%xmm1 \n" // 2*near "paddw %%xmm3,%%xmm3 \n" // 2*near - "paddw %%xmm4,%%xmm1 \n" // 3*near+far (1, lo) - "paddw %%xmm5,%%xmm3 \n" // 3*near+far (1, hi) + "paddw %%xmm4,%%xmm1 \n" // 3*near+far (2, lo) + "paddw %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) - // xmm4 xmm1 xmm0 xmm2 - // xmm5 xmm2 xmm1 xmm3 + // xmm0 xmm2 + // xmm1 xmm3 "movdqa %%xmm0,%%xmm4 \n" "movdqa %%xmm1,%%xmm5 \n" @@ -1099,6 +1098,166 @@ void 
ScaleRowUp2_Bilinear_16_SSSE3(const uint16_t* src_ptr, } #endif +#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 +void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + asm volatile( + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqd %%xmm4,%%xmm4 \n" + "psrld $31,%%xmm4 \n" + "pslld $1,%%xmm4 \n" // all 2 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 0123 (16b) + "movq 2(%0),%%xmm1 \n" // 1234 (16b) + + "punpcklwd %%xmm5,%%xmm0 \n" // 0123 (32b) + "punpcklwd %%xmm5,%%xmm1 \n" // 1234 (32b) + + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + + "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) + "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) + + "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) + "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) + "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) + "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) + + "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) + "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) + "packssdw %%xmm1,%%xmm0 \n" + "pshufd $0b11011000,%%xmm0,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif + +#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2 +void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "pxor %%xmm7,%%xmm7 \n" + "pcmpeqd %%xmm6,%%xmm6 \n" + "psrld $31,%%xmm6 \n" + "pslld $3,%%xmm6 \n" // all 8 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) + "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) + "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) + 
"punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) + "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) + "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) + "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + + "movq (%0),%%xmm0 \n" // 0123 (16b) + "movq 2(%0),%%xmm1 \n" // 1234 (16b) + "punpcklwd %%xmm7,%%xmm0 \n" // 0123 (32b) + "punpcklwd %%xmm7,%%xmm1 \n" // 1234 (32b) + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) + "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) + "paddd %%xmm0,%%xmm2 \n" // near+far (lo) + "paddd %%xmm1,%%xmm3 \n" // near+far (hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + + "movq (%0,%3,2),%%xmm2 \n" + "movq 2(%0,%3,2),%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm2 \n" // 0123 (32b) + "punpcklwd %%xmm7,%%xmm3 \n" // 1234 (32b) + "movdqa %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "pshufd $0b10110001,%%xmm4,%%xmm4 \n" // 1032 (even, far) + "pshufd $0b10110001,%%xmm5,%%xmm5 \n" // 2143 (odd, far) + "paddd %%xmm2,%%xmm4 \n" // near+far (lo) + "paddd %%xmm3,%%xmm5 \n" // near+far (hi) + "paddd %%xmm2,%%xmm2 \n" // 2*near (lo) + "paddd %%xmm3,%%xmm3 \n" // 2*near (hi) + "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) + "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrld 
$4,%%xmm4 \n" // ^ div by 16 (1, lo) + + "movdqa %%xmm2,%%xmm5 \n" + "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) + "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) + "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) + + "movdqa %%xmm1,%%xmm0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) + "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) + "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) + "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) + "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) + + "movdqa %%xmm3,%%xmm2 \n" + "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) + "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) + "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) + "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) + "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) + + "packssdw %%xmm0,%%xmm4 \n" + "pshufd $0b11011000,%%xmm4,%%xmm4 \n" + "movdqu %%xmm4,(%1) \n" // store above + "packssdw %%xmm2,%%xmm5 \n" + "pshufd $0b11011000,%%xmm4,%%xmm4 \n" + "movdqu %%xmm5,(%1,%4,2) \n" // store below + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + #ifdef HAS_SCALEROWUP2LINEAR_SSSE3 void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -1352,8 +1511,8 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, } #endif -#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 -void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, +#ifdef HAS_SCALEROWUP2LINEAR_12_AVX2 +void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { asm volatile( @@ -1402,8 +1561,8 @@ void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, } #endif -#ifdef 
HAS_SCALEROWUP2BILINEAR_16_AVX2 -void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, +#ifdef HAS_SCALEROWUP2BILINEAR_12_AVX2 +void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, @@ -1466,6 +1625,139 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, } #endif +#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 +void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrld $31,%%ymm4,%%ymm4 \n" + "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) + "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) + + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) + + "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) + "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) + + "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo) + "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi) + "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) + "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi) + + "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) + "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) + "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" + "vpshufd $0b11011000,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif + +#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2 +void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, + ptrdiff_t 
src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrld $31,%%ymm6,%%ymm6 \n" + "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 + + LABELALIGN + "1: \n" + + "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) + "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) + "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) + "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) + "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi) + "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) + "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (1, lo) + "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (1, hi) + + "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b, 1u1v) + "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b, 1u1v) + "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) + "vpshufd $0b10110001,%%ymm2,%%ymm4 \n" // 10325476 (lo, far) + "vpshufd $0b10110001,%%ymm3,%%ymm5 \n" // 21436587 (hi, far) + "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) + "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi) + "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo) + "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi) + "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (2, lo) + "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (2, hi) + + "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) + "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) + "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) + "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) + "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) + + "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) + "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) + "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" 
// 9*near+3*far (2, lo) + "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) + "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) + + "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) + "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) + "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) + "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) + + "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) + "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) + "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) + "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) + "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) + + "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" + "vpshufd $0b11011000,%%ymm4,%%ymm4 \n" + "vmovdqu %%ymm4,(%1) \n" // store above + "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" + "vpshufd $0b11011000,%%ymm5,%%ymm5 \n" + "vmovdqu %%ymm5,(%1,%4,2) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + // Reads 16xN bytes and produces 16 shorts at a time. 
void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, @@ -2522,7 +2814,6 @@ void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { asm volatile( - "vpxor %%xmm5,%%xmm5,%%xmm5 \n" "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" "vpsrld $31,%%ymm4,%%ymm4 \n" "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 @@ -2532,11 +2823,8 @@ void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0011000022330000 - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1122000033440000 - - "vpunpcklwd %%ymm5,%%ymm0,%%ymm0 \n" // 00112233 (32b, 1u1v) - "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" // 11223344 (32b, 1u1v) + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) @@ -2564,7 +2852,7 @@ void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif @@ -2575,7 +2863,6 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, ptrdiff_t dst_stride, int dst_width) { asm volatile( - "vpxor %%xmm7,%%xmm7,%%xmm7 \n" "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" "vpsrld $31,%%ymm6,%%ymm6 \n" "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 @@ -2585,10 +2872,8 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0011000022330000 - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1122000033440000 - "vpunpcklwd %%ymm7,%%ymm0,%%ymm0 \n" // 00112233 (32b, 1u1v) - "vpunpcklwd %%ymm7,%%ymm1,%%ymm1 \n" // 11223344 (32b, 1u1v) + "vpmovzxwd %%xmm0,%%ymm0 \n" // 
01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) @@ -2600,10 +2885,8 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, "vmovdqu (%0,%3,2),%%xmm2 \n" // 00112233 (16b, 1u1v) "vmovdqu 4(%0,%3,2),%%xmm3 \n" // 11223344 (16b, 1u1v) - "vpermq $0b11011000,%%ymm2,%%ymm2 \n" // 0011000022330000 - "vpermq $0b11011000,%%ymm3,%%ymm3 \n" // 1122000033440000 - "vpunpcklwd %%ymm7,%%ymm2,%%ymm2 \n" // 00112233 (32b, 1u1v) - "vpunpcklwd %%ymm7,%%ymm3,%%ymm3 \n" // 11223344 (32b, 1u1v) + "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) "vpshufd $0b01001110,%%ymm2,%%ymm4 \n" // 11003322 (lo, far) "vpshufd $0b01001110,%%ymm3,%%ymm5 \n" // 22114433 (hi, far) "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) @@ -2652,8 +2935,7 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif diff --git a/third_party/libyuv/source/scale_neon.cc b/third_party/libyuv/source/scale_neon.cc index 41dba3e8ea..6a0d6e1b49 100644 --- a/third_party/libyuv/source/scale_neon.cc +++ b/third_party/libyuv/source/scale_neon.cc @@ -603,7 +603,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, ); } -void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, +void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; @@ -633,7 +633,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, ); } -void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, +void ScaleRowUp2_Bilinear_12_NEON(const 
uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, @@ -647,7 +647,6 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, "vmov.u16 q15, #3 \n" "1: \n" - "add %5, %0, #2 \n" "vld1.16 {q0}, [%0]! \n" // 01234567 (16b) "vld1.16 {q1}, [%5]! \n" // 12345678 (16b) @@ -655,7 +654,6 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, "vmla.u16 q0, q1, q15 \n" // 3*near+far (odd) "vmla.u16 q1, q2, q15 \n" // 3*near+far (even) - "add %5, %1, #2 \n" "vld1.16 {q2}, [%1]! \n" // 01234567 (16b) "vld1.16 {q3}, [%6]! \n" // 12345678 (16b) @@ -692,6 +690,102 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, ); } +void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + const uint16_t* src_temp = src_ptr + 1; + asm volatile( + "vmov.u16 d31, #3 \n" + + "1: \n" + "vld1.16 {q0}, [%0]! \n" // 01234567 (16b) + "vld1.16 {q1}, [%3]! \n" // 12345678 (16b) + + "vmovl.u16 q2, d0 \n" // 0123 (32b) + "vmovl.u16 q3, d1 \n" // 4567 (32b) + "vmovl.u16 q4, d2 \n" // 1234 (32b) + "vmovl.u16 q5, d3 \n" // 5678 (32b) + + "vmlal.u16 q2, d2, d31 \n" + "vmlal.u16 q3, d3, d31 \n" + "vmlal.u16 q4, d0, d31 \n" + "vmlal.u16 q5, d1, d31 \n" + + "vrshrn.u32 d0, q4, #2 \n" + "vrshrn.u32 d1, q5, #2 \n" + "vrshrn.u32 d2, q2, #2 \n" + "vrshrn.u32 d3, q3, #2 \n" + + "vst2.16 {q0, q1}, [%1]! 
\n" // store + "subs %2, %2, #16 \n" // 8 sample -> 16 sample + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_temp) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List + ); +} + +void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* src_ptr1 = src_ptr + src_stride; + uint16_t* dst_ptr1 = dst_ptr + dst_stride; + const uint16_t* src_temp = src_ptr + 1; + const uint16_t* src_temp1 = src_ptr1 + 1; + + asm volatile( + "vmov.u16 d31, #3 \n" + "vmov.u32 q14, #3 \n" + + "1: \n" + "vld1.16 {d0}, [%0]! \n" // 0123 (16b) + "vld1.16 {d1}, [%5]! \n" // 1234 (16b) + "vmovl.u16 q2, d0 \n" // 0123 (32b) + "vmovl.u16 q3, d1 \n" // 1234 (32b) + "vmlal.u16 q2, d1, d31 \n" + "vmlal.u16 q3, d0, d31 \n" + + "vld1.16 {d0}, [%1]! \n" // 0123 (16b) + "vld1.16 {d1}, [%6]! \n" // 1234 (16b) + "vmovl.u16 q4, d0 \n" // 0123 (32b) + "vmovl.u16 q5, d1 \n" // 1234 (32b) + "vmlal.u16 q4, d1, d31 \n" + "vmlal.u16 q5, d0, d31 \n" + + "vmovq q0, q4 \n" + "vmovq q1, q5 \n" + "vmla.u32 q4, q2, q14 \n" + "vmla.u32 q5, q3, q14 \n" + "vmla.u32 q2, q0, q14 \n" + "vmla.u32 q3, q1, q14 \n" + + "vrshrn.u32 d1, q4, #4 \n" + "vrshrn.u32 d0, q5, #4 \n" + "vrshrn.u32 d3, q2, #4 \n" + "vrshrn.u32 d2, q3, #4 \n" + + "vst2.16 {d0, d1}, [%2]! \n" // store + "vst2.16 {d2, d3}, [%3]! 
\n" // store + "subs %4, %4, #8 \n" // 4 sample -> 8 sample + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_ptr1), // %3 + "+r"(dst_width), // %4 + "+r"(src_temp), // %5 + "+r"(src_temp1) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14", + "d31" // Clobber List + ); +} + void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { diff --git a/third_party/libyuv/source/scale_neon64.cc b/third_party/libyuv/source/scale_neon64.cc index 22fedcb5a4..8656fec7fa 100644 --- a/third_party/libyuv/source/scale_neon64.cc +++ b/third_party/libyuv/source/scale_neon64.cc @@ -630,7 +630,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, ); } -void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, +void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; @@ -661,7 +661,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, ); } -void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, +void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, @@ -721,6 +721,106 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, ); } +void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + const uint16_t* src_temp = src_ptr + 1; + asm volatile( + "movi v31.8h, #3 \n" + + "1: \n" + "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) + "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b) + "ushll2 v3.4s, v0.8h, #0 \n" // 4567 (32b) + "ushll v4.4s, v1.4h, #0 \n" // 1234 (32b) + "ushll2 v5.4s, v1.8h, #0 \n" // 5678 (32b) + + "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) + "umlal2 v3.4s, v1.8h, v31.8h \n" // 3*near+far (2, odd) + "umlal v4.4s, v0.4h, v31.4h \n" // 3*near+far (1, even) + "umlal2 
v5.4s, v0.8h, v31.8h \n" // 3*near+far (2, even) + + "rshrn v0.4h, v4.4s, #2 \n" // 3/4*near+1/4*far + "rshrn2 v0.8h, v5.4s, #2 \n" // 3/4*near+1/4*far (even) + "rshrn v1.4h, v2.4s, #2 \n" // 3/4*near+1/4*far + "rshrn2 v1.8h, v3.4s, #2 \n" // 3/4*near+1/4*far (odd) + + "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store + "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_temp), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List + ); +} + +void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* src_ptr1 = src_ptr + src_stride; + uint16_t* dst_ptr1 = dst_ptr + dst_stride; + const uint16_t* src_temp = src_ptr + 1; + const uint16_t* src_temp1 = src_ptr1 + 1; + + asm volatile( + "movi v31.4h, #3 \n" + "movi v30.4s, #3 \n" + + "1: \n" + "ldr d0, [%0], #8 \n" // 0123 (16b) + "ldr d1, [%2], #8 \n" // 1234 (16b) + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b) + "ushll v3.4s, v1.4h, #0 \n" // 1234 (32b) + "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) + "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even) + + "ldr d0, [%1], #8 \n" // 0123 (16b) + "ldr d1, [%3], #8 \n" // 1234 (16b) + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "ushll v4.4s, v0.4h, #0 \n" // 0123 (32b) + "ushll v5.4s, v1.4h, #0 \n" // 1234 (32b) + "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd) + "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even) + + "mov v0.16b, v4.16b \n" + "mov v1.16b, v5.16b \n" + "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd) + "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even) + "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd) + "mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even) + + "rshrn v1.4h, v4.4s, #4 \n" // 3/4*near+1/4*far + "rshrn v0.4h, v5.4s, #4 \n" // 3/4*near+1/4*far 
+ "rshrn v5.4h, v2.4s, #4 \n" // 3/4*near+1/4*far + "rshrn v4.4h, v3.4s, #4 \n" // 3/4*near+1/4*far + + "st2 {v0.4h, v1.4h}, [%4], #16 \n" // store 1 + "st2 {v4.4h, v5.4h}, [%5], #16 \n" // store 2 + + "subs %w6, %w6, #8 \n" // 4 sample -> 8 sample + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(src_temp), // %2 + "+r"(src_temp1), // %3 + "+r"(dst_ptr), // %4 + "+r"(dst_ptr1), // %5 + "+r"(dst_width) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", + "v31" // Clobber List + ); +} + void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { @@ -888,8 +988,8 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd) "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even) - "mov v0.4s, v4.4s \n" - "mov v1.4s, v5.4s \n" + "mov v0.16b, v4.16b \n" + "mov v1.16b, v5.16b \n" "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd) "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even) "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd) diff --git a/third_party/libyuv/source/scale_uv.cc b/third_party/libyuv/source/scale_uv.cc index 7b977912f9..d9a314453e 100644 --- a/third_party/libyuv/source/scale_uv.cc +++ b/third_party/libyuv/source/scale_uv.cc @@ -746,7 +746,8 @@ void ScaleUVBilinearUp2(int src_width, for (x = 0; x < src_height - 1; ++x) { Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); src_ptr += src_stride; - // TODO: Test performance of writing one row of destination at a time. + // TODO(fbarchard): Test performance of writing one row of destination at a + // time. dst_ptr += 2 * dst_stride; } if (!(dst_height & 1)) { @@ -851,7 +852,8 @@ void ScaleUVBilinearUp2_16(int src_width, for (x = 0; x < src_height - 1; ++x) { Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); src_ptr += src_stride; - // TODO: Test performance of writing one row of destination at a time. 
+ // TODO(fbarchard): Test performance of writing one row of destination at a + // time. dst_ptr += 2 * dst_stride; } if (!(dst_height & 1)) { diff --git a/third_party/libyuv/source/scale_win.cc b/third_party/libyuv/source/scale_win.cc index c5fc86f3e9..ea1f95c6c3 100644 --- a/third_party/libyuv/source/scale_win.cc +++ b/third_party/libyuv/source/scale_win.cc @@ -16,8 +16,9 @@ namespace libyuv { extern "C" { #endif -// This module is for 32 bit Visual C x86 and clangcl -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) +// This module is for 32 bit Visual C x86 +#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ + !defined(__clang__) && defined(_M_IX86) // Offsets for source bytes 0 to 9 static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, diff --git a/third_party/libyuv/unit_test/color_test.cc b/third_party/libyuv/unit_test/color_test.cc index a81ab19a86..e2d037ff79 100644 --- a/third_party/libyuv/unit_test/color_test.cc +++ b/third_party/libyuv/unit_test/color_test.cc @@ -22,8 +22,7 @@ namespace libyuv { // TODO(fbarchard): clang x86 has a higher accuracy YUV to RGB. 
// Port to Visual C and other CPUs -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) #define ERROR_FULL 5 #define ERROR_J420 4 #else @@ -32,7 +31,11 @@ namespace libyuv { #endif #define ERROR_R 1 #define ERROR_G 1 -#define ERROR_B 3 +#ifdef LIBYUV_UNLIMITED_DATA +#define ERROR_B 1 +#else +#define ERROR_B 18 +#endif #define TESTCS(TESTNAME, YUVTOARGB, ARGBTOYUV, HS1, HS, HN, DIFF) \ TEST_F(LibYUVColorTest, TESTNAME) { \ @@ -498,7 +501,11 @@ TEST_F(LibYUVColorTest, TestYUV) { YUVToRGB(240, 0, 0, &r1, &g1, &b1); EXPECT_EQ(57, r1); EXPECT_EQ(255, g1); +#ifdef LIBYUV_UNLIMITED_DATA + EXPECT_EQ(3, b1); +#else EXPECT_EQ(5, b1); +#endif for (int i = 0; i < 256; ++i) { YUVToRGBReference(i, 128, 128, &r0, &g0, &b0); @@ -655,9 +662,9 @@ TEST_F(LibYUVColorTest, TestFullYUVJ) { int y = RANDOM256(y2); YUVJToRGBReference(y, u, v, &r0, &g0, &b0); YUVJToRGB(y, u, v, &r1, &g1, &b1); - EXPECT_NEAR(r0, r1, 1); - EXPECT_NEAR(g0, g1, 1); - EXPECT_NEAR(b0, b1, 1); + EXPECT_NEAR(r0, r1, ERROR_R); + EXPECT_NEAR(g0, g1, ERROR_G); + EXPECT_NEAR(b0, b1, ERROR_B); ++rh[r1 - r0 + 128]; ++gh[g1 - g0 + 128]; ++bh[b1 - b0 + 128]; @@ -687,8 +694,7 @@ TEST_F(LibYUVColorTest, TestFullYUVH) { YUVHToRGB(y, u, v, &r1, &g1, &b1); EXPECT_NEAR(r0, r1, ERROR_R); EXPECT_NEAR(g0, g1, ERROR_G); - // TODO(crbug.com/libyuv/862): Reduce the errors in the B channel. 
- EXPECT_NEAR(b0, b1, 15); + EXPECT_NEAR(b0, b1, ERROR_B); ++rh[r1 - r0 + 128]; ++gh[g1 - g0 + 128]; ++bh[b1 - b0 + 128]; @@ -716,9 +722,9 @@ TEST_F(LibYUVColorTest, TestFullYUVF) { int y = RANDOM256(y2); YUVFToRGBReference(y, u, v, &r0, &g0, &b0); YUVFToRGB(y, u, v, &r1, &g1, &b1); - EXPECT_NEAR(r0, r1, 5); - EXPECT_NEAR(g0, g1, 5); - EXPECT_NEAR(b0, b1, 5); + EXPECT_NEAR(r0, r1, ERROR_R); + EXPECT_NEAR(g0, g1, ERROR_G); + EXPECT_NEAR(b0, b1, ERROR_B); ++rh[r1 - r0 + 128]; ++gh[g1 - g0 + 128]; ++bh[b1 - b0 + 128]; @@ -748,8 +754,7 @@ TEST_F(LibYUVColorTest, TestFullYUVU) { YUVUToRGB(y, u, v, &r1, &g1, &b1); EXPECT_NEAR(r0, r1, ERROR_R); EXPECT_NEAR(g0, g1, ERROR_G); - // TODO(crbug.com/libyuv/863): Reduce the errors in the B channel. - EXPECT_NEAR(b0, b1, 18); + EXPECT_NEAR(b0, b1, ERROR_B); ++rh[r1 - r0 + 128]; ++gh[g1 - g0 + 128]; ++bh[b1 - b0 + 128]; diff --git a/third_party/libyuv/unit_test/compare_test.cc b/third_party/libyuv/unit_test/compare_test.cc index bd99cdd3ac..c29562cb86 100644 --- a/third_party/libyuv/unit_test/compare_test.cc +++ b/third_party/libyuv/unit_test/compare_test.cc @@ -344,7 +344,7 @@ static const int kMaxOptCount = (1 << (32 - 3)) - 64; // 536870848 TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) { uint32_t h1 = 0; - const int kMaxWidth = (benchmark_width_ * benchmark_height_ + 31) & ~31; + const int kMaxWidth = (benchmark_width_ * benchmark_height_ + 63) & ~63; align_buffer_page_end(src_a, kMaxWidth); align_buffer_page_end(src_b, kMaxWidth); memset(src_a, 255u, kMaxWidth); diff --git a/third_party/libyuv/unit_test/convert_test.cc b/third_party/libyuv/unit_test/convert_test.cc index 8638a84c13..3855838381 100644 --- a/third_party/libyuv/unit_test/convert_test.cc +++ b/third_party/libyuv/unit_test/convert_test.cc @@ -55,14 +55,14 @@ namespace libyuv { static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \ static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \ static_assert(SRC_SUBSAMP_X == 1 || 
SRC_SUBSAMP_X == 2, \ - "SRC_SUBSAMP_X unsupported"); \ + "SRC_SUBSAMP_X unsupported"); \ static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \ - "SRC_SUBSAMP_Y unsupported"); \ + "SRC_SUBSAMP_Y unsupported"); \ static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \ - "DST_SUBSAMP_X unsupported"); \ + "DST_SUBSAMP_X unsupported"); \ static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ - "DST_SUBSAMP_Y unsupported"); \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + "DST_SUBSAMP_Y unsupported"); \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ @@ -137,7 +137,7 @@ namespace libyuv { DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ - benchmark_width_ - 4, _Any, +, 0, SRC_DEPTH) \ + benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH) \ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ benchmark_width_, _Unaligned, +, 1, SRC_DEPTH) \ @@ -183,8 +183,8 @@ TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 12) #define TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ W1280, N, NEG, OFF, PN, OFF_U, OFF_V) \ - TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##_##PN##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##To##PN##N) { \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kSizeUV = \ SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ @@ -270,7 +270,7 @@ TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 12) SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, \ SUBSAMP_Y) \ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_ + 1, \ _Any, +, 0, PN, OFF_U, OFF_V) \ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, \ @@ -318,7 +318,7 @@ int I400ToNV21(const uint8_t* src_y, "DST_SUBSAMP_X unsupported"); \ static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ "DST_SUBSAMP_Y unsupported"); \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ @@ -386,7 +386,7 @@ int I400ToNV21(const uint8_t* src_y, DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, SRC_DEPTH) \ + DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH) \ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, \ @@ -424,7 +424,7 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12) "DST_SUBSAMP_X unsupported"); \ static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ "DST_SUBSAMP_Y unsupported"); \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ @@ -493,7 +493,7 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12) DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \ TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, 1, \ + DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, 1, \ SRC_DEPTH) \ TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ @@ -537,7 +537,7 @@ TESTBIPLANARTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 12) "DST_SUBSAMP_X unsupported"); \ static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ "DST_SUBSAMP_Y unsupported"); \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ @@ -606,7 +606,7 @@ TESTBIPLANARTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 12) DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ - DST_SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, SRC_DEPTH) \ + DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH) \ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, \ @@ -654,7 +654,7 @@ TESTBIPLANARTOP(NV21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8) #define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, 
FMT_PLANAR##To##FMT_B##N) { \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kWidth = W1280; \ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ @@ -702,7 +702,7 @@ TESTBIPLANARTOP(NV21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8) #define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN) \ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_ - 4, _Any, +, 0) \ + YALIGN, benchmark_width_ + 1, _Any, +, 0) \ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, _Unaligned, +, 1) \ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ @@ -769,12 +769,14 @@ TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST TESTPLANARTOB(I420, 2, 2, AR30, 4, 4, 1) TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1) +TESTPLANARTOB(I420, 2, 2, AB30, 4, 4, 1) +TESTPLANARTOB(H420, 2, 2, AB30, 4, 4, 1) #endif #define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, W1280, N, NEG, OFF, ATTEN) \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kWidth = W1280; \ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ @@ -821,7 +823,7 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1) #define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN) \ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_ - 4, _Any, +, 0, 0) \ + YALIGN, benchmark_width_ + 1, _Any, +, 0, 0) \ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, _Unaligned, +, 1, 0) \ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ @@ -928,6 +930,8 @@ TESTQPLANARTOB(J420Alpha, 2, 2, ARGB, 4, 4, 1) TESTQPLANARTOB(J420Alpha, 2, 2, ABGR, 4, 4, 1) TESTQPLANARTOB(H420Alpha, 2, 2, ARGB, 4, 4, 1) TESTQPLANARTOB(H420Alpha, 2, 2, ABGR, 4, 4, 1) +TESTQPLANARTOB(F420Alpha, 2, 2, ARGB, 4, 4, 1) +TESTQPLANARTOB(F420Alpha, 2, 2, ABGR, 4, 4, 1) TESTQPLANARTOB(U420Alpha, 2, 2, ARGB, 4, 4, 1) TESTQPLANARTOB(U420Alpha, 2, 2, ABGR, 4, 4, 1) TESTQPLANARTOB(V420Alpha, 2, 2, ARGB, 4, 4, 1) @@ -938,6 +942,8 @@ TESTQPLANARTOB(J422Alpha, 2, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(J422Alpha, 2, 1, ABGR, 4, 4, 1) TESTQPLANARTOB(H422Alpha, 2, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(H422Alpha, 2, 1, ABGR, 4, 4, 1) +TESTQPLANARTOB(F422Alpha, 2, 1, ARGB, 4, 4, 1) +TESTQPLANARTOB(F422Alpha, 2, 1, ABGR, 4, 4, 1) TESTQPLANARTOB(U422Alpha, 2, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(U422Alpha, 2, 1, ABGR, 4, 4, 1) TESTQPLANARTOB(V422Alpha, 2, 1, ARGB, 4, 4, 1) @@ -948,6 +954,8 @@ TESTQPLANARTOB(J444Alpha, 1, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(J444Alpha, 1, 1, ABGR, 4, 4, 1) TESTQPLANARTOB(H444Alpha, 1, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(H444Alpha, 1, 1, ABGR, 4, 4, 1) +TESTQPLANARTOB(F444Alpha, 1, 1, ARGB, 4, 4, 1) +TESTQPLANARTOB(F444Alpha, 1, 1, ABGR, 4, 4, 1) TESTQPLANARTOB(U444Alpha, 1, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(U444Alpha, 
1, 1, ABGR, 4, 4, 1) TESTQPLANARTOB(V444Alpha, 1, 1, ARGB, 4, 4, 1) @@ -956,7 +964,7 @@ TESTQPLANARTOB(V444Alpha, 1, 1, ABGR, 4, 4, 1) #define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, \ BPP_B, W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kStrideB = kWidth * BPP_B; \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ @@ -1009,7 +1017,7 @@ TESTQPLANARTOB(V444Alpha, 1, 1, ABGR, 4, 4, 1) #define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B) \ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - benchmark_width_ - 4, _Any, +, 0) \ + benchmark_width_ + 1, _Any, +, 0) \ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ benchmark_width_, _Unaligned, +, 1) \ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ @@ -1064,7 +1072,7 @@ TESTBIPLANARTOB(NV12, 2, 2, RGB565, RGB565, 2) #define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kWidth = W1280; \ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \ @@ -1111,7 +1119,7 @@ TESTBIPLANARTOB(NV12, 2, 2, RGB565, RGB565, 2) #define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_ - 4, _Any, +, 0) \ + benchmark_width_ + 1, _Any, +, 0) \ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ benchmark_width_, _Unaligned, +, 1) \ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ @@ -1134,6 +1142,7 @@ TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2) TESTATOPLANAR(I400, 1, 1, I420, 2, 2) TESTATOPLANAR(J400, 1, 1, J420, 2, 2) TESTATOPLANAR(RAW, 3, 1, I420, 2, 2) +TESTATOPLANAR(RAW, 3, 1, J420, 2, 2) TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2) TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2) TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2) @@ -1145,7 +1154,7 @@ TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1) #define TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, \ SUBSAMP_Y, W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kStride = SUBSAMPLE(kWidth, SUB_A) * BPP_A; \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ @@ -1191,7 +1200,7 @@ TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1) #define TESTATOBIPLANAR(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ - benchmark_width_ - 4, _Any, +, 0) \ + benchmark_width_ + 1, _Any, +, 0) \ TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ benchmark_width_, _Unaligned, +, 1) \ TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ @@ -1208,152 +1217,166 @@ TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2) TESTATOBIPLANAR(AYUV, 1, 4, NV12, 2, 2) TESTATOBIPLANAR(AYUV, 1, 4, NV21, 2, 2) -#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, W1280, N, NEG, OFF) \ - TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ - const int kHeight = benchmark_height_; \ - const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ - const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ - const int kStrideA = \ - (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ - const int kStrideB = \ - (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ - align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \ - align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \ - align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \ - for (int i = 0; i < kStrideA * kHeightA; ++i) { \ - src_argb[i + OFF] = (fastrand() & 0xff); \ - } \ - memset(dst_argb_c, 1, kStrideB* kHeightB); \ - memset(dst_argb_opt, 101, kStrideB* kHeightB); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_A##To##FMT_B(src_argb + OFF, kStrideA, dst_argb_c, kStrideB, kWidth, \ - NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_A##To##FMT_B(src_argb + OFF, kStrideA, dst_argb_opt, kStrideB, \ - kWidth, NEG kHeight); \ - } \ - for (int i = 0; i < kStrideB * kHeightB; ++i) { \ - EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ - } \ - free_aligned_buffer_page_end(src_argb); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_opt); \ +#define TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \ + EPP_B, STRIDE_B, HEIGHT_B, W1280, N, NEG, OFF) \ + TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \ + const int kWidth = W1280; \ + const int kHeight = benchmark_height_; \ + const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ + const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ + const int kStrideA = \ + (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ + const int kStrideB = \ + (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ + align_buffer_page_end(src_argb, \ + kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \ + 
align_buffer_page_end(dst_argb_c, kStrideB* kHeightB*(int)sizeof(TYPE_B)); \ + align_buffer_page_end(dst_argb_opt, \ + kStrideB* kHeightB*(int)sizeof(TYPE_B)); \ + for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \ + src_argb[i + OFF] = (fastrand() & 0xff); \ + } \ + memset(dst_argb_c, 1, kStrideB* kHeightB); \ + memset(dst_argb_opt, 101, kStrideB* kHeightB); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_B*)dst_argb_c, \ + kStrideB, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, \ + (TYPE_B*)dst_argb_opt, kStrideB, kWidth, NEG kHeight); \ + } \ + for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) { \ + EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ + } \ + free_aligned_buffer_page_end(src_argb); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + } + +#define TESTATOBRANDOM(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, \ + TYPE_B, EPP_B, STRIDE_B, HEIGHT_B) \ + TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) { \ + for (int times = 0; times < benchmark_iterations_; ++times) { \ + const int kWidth = (fastrand() & 63) + 1; \ + const int kHeight = (fastrand() & 31) + 1; \ + const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ + const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ + const int kStrideA = \ + (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ + const int kStrideB = \ + (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ + align_buffer_page_end(src_argb, kStrideA* kHeightA*(int)sizeof(TYPE_A)); \ + align_buffer_page_end(dst_argb_c, \ + kStrideB* kHeightB*(int)sizeof(TYPE_B)); \ + align_buffer_page_end(dst_argb_opt, \ + kStrideB* kHeightB*(int)sizeof(TYPE_B)); \ + for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \ + 
src_argb[i] = 0xfe; \ + } \ + memset(dst_argb_c, 123, kStrideB* kHeightB); \ + memset(dst_argb_opt, 123, kStrideB* kHeightB); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_c, \ + kStrideB, kWidth, kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_opt, \ + kStrideB, kWidth, kHeight); \ + for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) { \ + EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ + } \ + free_aligned_buffer_page_end(src_argb); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + } \ } -#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, \ - STRIDE_B, HEIGHT_B) \ - TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) { \ - for (int times = 0; times < benchmark_iterations_; ++times) { \ - const int kWidth = (fastrand() & 63) + 1; \ - const int kHeight = (fastrand() & 31) + 1; \ - const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ - const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ - const int kStrideA = \ - (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ - const int kStrideB = \ - (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ - align_buffer_page_end(src_argb, kStrideA* kHeightA); \ - align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \ - align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \ - for (int i = 0; i < kStrideA * kHeightA; ++i) { \ - src_argb[i] = (fastrand() & 0xff); \ - } \ - memset(dst_argb_c, 123, kStrideB* kHeightB); \ - memset(dst_argb_opt, 123, kStrideB* kHeightB); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_A##To##FMT_B(src_argb, kStrideA, dst_argb_c, kStrideB, kWidth, \ - kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - FMT_A##To##FMT_B(src_argb, kStrideA, dst_argb_opt, kStrideB, kWidth, \ - kHeight); \ - for (int i = 0; i < kStrideB * kHeightB; ++i) { \ - 
EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ - } \ - free_aligned_buffer_page_end(src_argb); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_opt); \ - } \ - } - -#define TESTATOB(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B) \ - TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, benchmark_width_ - 4, _Any, +, 0) \ - TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, benchmark_width_, _Unaligned, +, 1) \ - TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, benchmark_width_, _Invert, -, 0) \ - TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, benchmark_width_, _Opt, +, 0) \ - TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B) - -TESTATOB(AB30, 4, 4, 1, ABGR, 4, 4, 1) -TESTATOB(AB30, 4, 4, 1, ARGB, 4, 4, 1) +#define TESTATOB(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \ + EPP_B, STRIDE_B, HEIGHT_B) \ + TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \ + STRIDE_B, HEIGHT_B, benchmark_width_ + 1, _Any, +, 0) \ + TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \ + STRIDE_B, HEIGHT_B, benchmark_width_, _Unaligned, +, 1) \ + TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \ + STRIDE_B, HEIGHT_B, benchmark_width_, _Invert, -, 0) \ + TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \ + STRIDE_B, HEIGHT_B, benchmark_width_, _Opt, +, 0) \ + TESTATOBRANDOM(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \ + EPP_B, STRIDE_B, HEIGHT_B) + +TESTATOB(AB30, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) +TESTATOB(AB30, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTATOB(ABGR, 4, 4, 1, AR30, 4, 4, 1) +TESTATOB(ABGR, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1) #endif -TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1) 
+TESTATOB(ABGR, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTATOB(AR30, 4, 4, 1, AB30, 4, 4, 1) +TESTATOB(AR30, uint8_t, 4, 4, 1, AB30, uint8_t, 4, 4, 1) #endif -TESTATOB(AR30, 4, 4, 1, ABGR, 4, 4, 1) +TESTATOB(AR30, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTATOB(AR30, 4, 4, 1, AR30, 4, 4, 1) -TESTATOB(AR30, 4, 4, 1, ARGB, 4, 4, 1) +TESTATOB(AR30, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1) +TESTATOB(AR30, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) #endif -TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTATOB(ARGB, 4, 4, 1, AR30, 4, 4, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1) #endif -TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1) -TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1) -TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1) -TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1) -TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1) -TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1) -TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1) -TESTATOB(RGBA, 4, 4, 1, J400, 1, 1, 1) -TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1) -TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1) -TESTATOB(ABGR, 4, 4, 1, RAW, 3, 3, 1) -TESTATOB(ABGR, 4, 4, 1, RGB24, 3, 3, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGB1555, uint8_t, 2, 2, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGB4444, uint8_t, 2, 2, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGBMirror, uint8_t, 4, 4, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, BGRA, uint8_t, 4, 4, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, I400, uint8_t, 1, 1, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) +TESTATOB(RGBA, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1) +TESTATOB(ABGR, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1) +TESTATOB(ABGR, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST 
-TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, RGB565, uint8_t, 2, 2, 1) #endif -TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1) -TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1) -TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1) // 4 -TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1) -TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1) -TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1) -TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1) -TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1) -TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1) -TESTATOB(J400, 1, 1, 1, ARGB, 4, 4, 1) -TESTATOB(J400, 1, 1, 1, J400, 1, 1, 1) -TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1) -TESTATOB(RAW, 3, 3, 1, RGBA, 4, 4, 1) -TESTATOB(RAW, 3, 3, 1, RGB24, 3, 3, 1) -TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1) -TESTATOB(RGB24, 3, 3, 1, J400, 1, 1, 1) -TESTATOB(RGB24, 3, 3, 1, RGB24Mirror, 3, 3, 1) -TESTATOB(RAW, 3, 3, 1, J400, 1, 1, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, RGBA, uint8_t, 4, 4, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, UYVY, uint8_t, 2, 4, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, YUY2, uint8_t, 2, 4, 1) // 4 +TESTATOB(ARGB1555, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(ARGB4444, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(BGRA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(I400, uint8_t, 1, 1, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(I400, uint8_t, 1, 1, 1, I400, uint8_t, 1, 1, 1) +TESTATOB(I400, uint8_t, 1, 1, 1, I400Mirror, uint8_t, 1, 1, 1) +TESTATOB(J400, uint8_t, 1, 1, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(J400, uint8_t, 1, 1, 1, J400, uint8_t, 1, 1, 1) +TESTATOB(RAW, uint8_t, 3, 3, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(RAW, uint8_t, 3, 3, 1, RGBA, uint8_t, 4, 4, 1) +TESTATOB(RAW, uint8_t, 3, 3, 1, RGB24, uint8_t, 3, 3, 1) +TESTATOB(RGB24, uint8_t, 3, 3, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(RGB24, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1) +TESTATOB(RGB24, uint8_t, 3, 3, 1, RGB24Mirror, uint8_t, 3, 3, 1) +TESTATOB(RAW, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1) 
+TESTATOB(RGB565, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) #endif -TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1) -TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1) -TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1) -TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1) +TESTATOB(RGBA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(UYVY, uint8_t, 2, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(YUY2, uint8_t, 2, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(YUY2, uint8_t, 2, 4, 1, Y, uint8_t, 1, 1, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1) +TESTATOB(ABGR, uint8_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) +TESTATOB(ABGR, uint8_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1) +TESTATOB(AR64, uint16_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(AB64, uint16_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(AR64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) +TESTATOB(AB64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) +TESTATOB(AR64, uint16_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1) +TESTATOB(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) #define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ HEIGHT_B, W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ @@ -1423,7 +1446,7 @@ TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1) #define TESTATOBD(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ HEIGHT_B) \ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, benchmark_width_ - 4, _Any, +, 0) \ + HEIGHT_B, benchmark_width_ + 1, _Any, +, 0) \ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ HEIGHT_B, benchmark_width_, _Unaligned, +, 1) \ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ @@ -1437,35 +1460,39 @@ TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1) TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1) #endif -#define TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, W1280, N, NEG, OFF) \ +#define TESTSYMI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, W1280, N, NEG, \ + OFF) \ TEST_F(LibYUVConvertTest, FMT_ATOB##_Symetric##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ const int kStrideA = \ - (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ - align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \ - align_buffer_page_end(dst_argb_c, kStrideA* kHeightA); \ - align_buffer_page_end(dst_argb_opt, kStrideA* kHeightA); \ - for (int i = 0; i < kStrideA * kHeightA; ++i) { \ + (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ + align_buffer_page_end(src_argb, \ + kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideA* kHeightA*(int)sizeof(TYPE_A)); \ + align_buffer_page_end(dst_argb_opt, \ + kStrideA* kHeightA*(int)sizeof(TYPE_A)); \ + for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \ src_argb[i + OFF] = (fastrand() & 0xff); \ } \ memset(dst_argb_c, 1, kStrideA* kHeightA); \ memset(dst_argb_opt, 101, kStrideA* kHeightA); \ MaskCpuFlags(disable_cpu_flags_); \ - FMT_ATOB(src_argb + OFF, kStrideA, dst_argb_c, kStrideA, kWidth, \ - NEG kHeight); \ + FMT_ATOB((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_A*)dst_argb_c, \ + kStrideA, kWidth, NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_ATOB(src_argb + OFF, kStrideA, dst_argb_opt, kStrideA, kWidth, \ - NEG kHeight); \ + FMT_ATOB((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_A*)dst_argb_opt, \ + kStrideA, kWidth, NEG kHeight); \ } \ MaskCpuFlags(disable_cpu_flags_); \ - FMT_ATOB(dst_argb_c, kStrideA, dst_argb_c, kStrideA, kWidth, NEG kHeight); \ + FMT_ATOB((TYPE_A*)dst_argb_c, kStrideA, (TYPE_A*)dst_argb_c, kStrideA, \ + kWidth, NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ - FMT_ATOB(dst_argb_opt, kStrideA, dst_argb_opt, kStrideA, kWidth, \ - NEG kHeight); \ - for (int i = 0; i < kStrideA * kHeightA; ++i) { \ + FMT_ATOB((TYPE_A*)dst_argb_opt, kStrideA, (TYPE_A*)dst_argb_opt, kStrideA, \ + 
kWidth, NEG kHeight); \ + for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \ EXPECT_EQ(src_argb[i + OFF], dst_argb_opt[i]); \ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ } \ @@ -1474,18 +1501,20 @@ TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1) free_aligned_buffer_page_end(dst_argb_opt); \ } -#define TESTSYM(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A) \ - TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, benchmark_width_ - 4, _Any, +, \ - 0) \ - TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, _Unaligned, \ - +, 1) \ - TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, _Opt, +, 0) +#define TESTSYM(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A) \ + TESTSYMI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, benchmark_width_ + 1, \ + _Any, +, 0) \ + TESTSYMI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, \ + _Unaligned, +, 1) \ + TESTSYMI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, \ + _Opt, +, 0) -TESTSYM(ARGBToARGB, 4, 4, 1) -TESTSYM(ARGBToBGRA, 4, 4, 1) -TESTSYM(ARGBToABGR, 4, 4, 1) -TESTSYM(BGRAToARGB, 4, 4, 1) -TESTSYM(ABGRToARGB, 4, 4, 1) +TESTSYM(ARGBToARGB, uint8_t, 4, 4, 1) +TESTSYM(ARGBToBGRA, uint8_t, 4, 4, 1) +TESTSYM(ARGBToABGR, uint8_t, 4, 4, 1) +TESTSYM(BGRAToARGB, uint8_t, 4, 4, 1) +TESTSYM(ABGRToARGB, uint8_t, 4, 4, 1) +TESTSYM(AB64ToAR64, uint16_t, 4, 4, 1) TEST_F(LibYUVConvertTest, Test565) { SIMD_ALIGNED(uint8_t orig_pixels[256][4]); @@ -2349,7 +2378,11 @@ TEST_F(LibYUVConvertTest, TestMJPGToARGB) { // Test result matches known hash value. 
uint32_t dst_argb_hash = HashDjb2(dst_argb, width * height, 5381); +#ifdef LIBYUV_UNLIMITED_DATA + EXPECT_EQ(dst_argb_hash, 3900633302u); +#else EXPECT_EQ(dst_argb_hash, 2355976473u); +#endif free_aligned_buffer_page_end(dst_argb); } @@ -2658,7 +2691,7 @@ TEST_F(LibYUVConvertTest, TestDither) { #define TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, W1280, N, NEG, OFF, FMT_C, BPP_C) \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##Dither##N) { \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kWidth = W1280; \ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ @@ -2711,7 +2744,7 @@ TEST_F(LibYUVConvertTest, TestDither) { #define TESTPLANARTOBD(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, FMT_C, BPP_C) \ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C) \ + YALIGN, benchmark_width_ + 1, _Any, +, 0, FMT_C, BPP_C) \ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C) \ TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ @@ -2784,11 +2817,12 @@ TESTPTOB(TestYUY2ToNV12, YUY2ToI420, YUY2ToNV12) TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12) // Transitive tests. A to B to C is same as A to C. +// Benchmarks A To B to C for comparison to 1 step, benchmarked elsewhere. #define TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ W1280, N, NEG, OFF, FMT_C, BPP_C) \ - TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##_##FMT_C##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##To##FMT_C##N) { \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ @@ -2805,23 +2839,23 @@ TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12) src_v[i + OFF] = (fastrand() & 0xff); \ } \ memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \ - src_v + OFF, kStrideUV, dst_argb_b + OFF, \ - kStrideB, kWidth, NEG kHeight); \ - } \ + FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \ + src_v + OFF, kStrideUV, dst_argb_b + OFF, kStrideB, \ + kWidth, NEG kHeight); \ /* Convert to a 3rd format in 1 step and 2 steps and compare */ \ const int kStrideC = kWidth * BPP_C; \ align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \ align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \ memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \ memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \ - FMT_PLANAR##To##FMT_C(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \ - src_v + OFF, kStrideUV, dst_argb_c + OFF, kStrideC, \ - kWidth, NEG kHeight); \ - /* Convert B to C */ \ - FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, kStrideC, \ - kWidth, kHeight); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_C(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \ + src_v + OFF, kStrideUV, dst_argb_c + OFF, \ + kStrideC, kWidth, NEG kHeight); \ + /* Convert B to C */ \ + FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, \ + kStrideC, kWidth, kHeight); \ + } \ for (int i = 0; i < kStrideC * kHeight; ++i) { \ EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]); \ } \ @@ -2836,7 +2870,7 @@ TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12) #define TESTPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, 
BPP_B, \ FMT_C, BPP_C) \ TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ - benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C) \ + benchmark_width_ + 1, _Any, +, 0, FMT_C, BPP_C) \ TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C) \ TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ @@ -2844,26 +2878,28 @@ TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12) TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ benchmark_width_, _Opt, +, 0, FMT_C, BPP_C) -TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ABGR, 4) TESTPLANARTOE(I420, 2, 2, ABGR, 1, 4, ARGB, 4) -TESTPLANARTOE(J420, 2, 2, ARGB, 1, 4, ARGB, 4) -TESTPLANARTOE(J420, 2, 2, ABGR, 1, 4, ARGB, 4) -TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, ARGB, 4) -TESTPLANARTOE(H420, 2, 2, ABGR, 1, 4, ARGB, 4) -TESTPLANARTOE(U420, 2, 2, ARGB, 1, 4, ARGB, 4) -TESTPLANARTOE(U420, 2, 2, ABGR, 1, 4, ARGB, 4) +TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ABGR, 4) +TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RAW, 3) +TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB24, 3) TESTPLANARTOE(I420, 2, 2, BGRA, 1, 4, ARGB, 4) -TESTPLANARTOE(I420, 2, 2, RGBA, 1, 4, ARGB, 4) -TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, ARGB, 4) +TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, ARGB, 4) TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, RGB24, 3) +TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, ARGB, 4) TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, RAW, 3) -TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RAW, 3) -TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, ARGB, 4) -TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, ARGB, 4) -TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, RGB24, 3) -TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, RAW, 3) +TESTPLANARTOE(I420, 2, 2, RGBA, 1, 4, ARGB, 4) +TESTPLANARTOE(H420, 2, 2, ABGR, 1, 4, ARGB, 4) +TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, ABGR, 4) TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, RAW, 3) +TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, RGB24, 3) TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, ARGB, 4) +TESTPLANARTOE(H420, 2, 2, 
RAW, 1, 3, RGB24, 3) +TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, ARGB, 4) +TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, RAW, 3) +TESTPLANARTOE(J420, 2, 2, ABGR, 1, 4, ARGB, 4) +TESTPLANARTOE(J420, 2, 2, ARGB, 1, 4, ARGB, 4) +TESTPLANARTOE(U420, 2, 2, ABGR, 1, 4, ARGB, 4) +TESTPLANARTOE(U420, 2, 2, ARGB, 1, 4, ARGB, 4) #ifdef LITTLE_ENDIAN_ONLY_TEST TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB565, 2) TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB1555, 2) @@ -2899,8 +2935,8 @@ TESTPLANARTOE(I422, 2, 1, UYVY, 2, 4, ARGB, 4) #define TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ W1280, N, NEG, OFF, FMT_C, BPP_C, ATTEN) \ - TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##_##FMT_C##N) { \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##To##FMT_C##N) { \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \ const int kSizeUV = \ @@ -2919,25 +2955,25 @@ TESTPLANARTOE(I422, 2, 1, UYVY, 2, 4, ARGB, 4) src_v[i + OFF] = (fastrand() & 0xff); \ } \ memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_PLANAR##To##FMT_B( \ - src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ - src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth, \ - dst_argb_b + OFF, kStrideB, kWidth, NEG kHeight, ATTEN); \ - } \ + FMT_PLANAR##To##FMT_B( \ + src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ + src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth, \ + dst_argb_b + OFF, kStrideB, kWidth, NEG kHeight, ATTEN); \ /* Convert to a 3rd format in 1 step and 2 steps and compare */ \ const int kStrideC = kWidth * BPP_C; \ align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \ align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \ memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \ memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \ - 
FMT_PLANAR##To##FMT_C( \ - src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ - src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth, \ - dst_argb_c + OFF, kStrideC, kWidth, NEG kHeight, ATTEN); \ - /* Convert B to C */ \ - FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, kStrideC, \ - kWidth, kHeight); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_C( \ + src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ + src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth, \ + dst_argb_c + OFF, kStrideC, kWidth, NEG kHeight, ATTEN); \ + /* Convert B to C */ \ + FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, \ + kStrideC, kWidth, kHeight); \ + } \ for (int i = 0; i < kStrideC * kHeight; ++i) { \ EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]); \ } \ @@ -2953,7 +2989,7 @@ TESTPLANARTOE(I422, 2, 1, UYVY, 2, 4, ARGB, 4) #define TESTQPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ FMT_C, BPP_C) \ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ - benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C, 0) \ + benchmark_width_ + 1, _Any, +, 0, FMT_C, BPP_C, 0) \ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ benchmark_width_, _Unaligned, +, 1, FMT_C, BPP_C, 0) \ TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ @@ -3000,8 +3036,8 @@ TESTQPLANARTOE(V444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4) #define TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, W1280, N, NEG, \ OFF, FMT_C, BPP_C) \ - TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_##FMT_C##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##To##FMT_C##N) { \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kStrideA = SUBSAMPLE(kWidth, SUB_A) * BPP_A; \ const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \ @@ -3009,21 +3045,21 @@ TESTQPLANARTOE(V444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4) align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \ MemRandomize(src_argb_a + OFF, kStrideA * kHeight); \ memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_A##To##FMT_B(src_argb_a + OFF, kStrideA, dst_argb_b + OFF, kStrideB, \ - kWidth, NEG kHeight); \ - } \ + FMT_A##To##FMT_B(src_argb_a + OFF, kStrideA, dst_argb_b + OFF, kStrideB, \ + kWidth, NEG kHeight); \ /* Convert to a 3rd format in 1 step and 2 steps and compare */ \ const int kStrideC = kWidth * BPP_C; \ align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \ align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \ memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \ memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \ - FMT_A##To##FMT_C(src_argb_a + OFF, kStrideA, dst_argb_c + OFF, kStrideC, \ - kWidth, NEG kHeight); \ - /* Convert B to C */ \ - FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, kStrideC, \ - kWidth, kHeight); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_C(src_argb_a + OFF, kStrideA, dst_argb_c + OFF, kStrideC, \ + kWidth, NEG kHeight); \ + /* Convert B to C */ \ + FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, \ + kStrideC, kWidth, kHeight); \ + } \ for (int i = 0; i < kStrideC * kHeight; i += 4) { \ EXPECT_EQ(dst_argb_c[i + OFF + 0], dst_argb_bc[i + OFF + 0]); \ EXPECT_EQ(dst_argb_c[i + OFF + 1], dst_argb_bc[i + OFF + 1]); \ @@ -3038,7 +3074,7 @@ TESTQPLANARTOE(V444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4) #define TESTPLANETOE(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, FMT_C, BPP_C) \ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, 
SUB_B, BPP_B, \ - benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C) \ + benchmark_width_ + 1, _Any, +, 0, FMT_C, BPP_C) \ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \ _Unaligned, +, 1, FMT_C, BPP_C) \ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \ @@ -3161,91 +3197,457 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) { } #endif // HAS_ABGRTOAR30ROW_AVX2 +// Provide matrix wrappers for 12 bit YUV +#define I012ToARGB(a, b, c, d, e, f, g, h, i, j) \ + I012ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) +#define I012ToAR30(a, b, c, d, e, f, g, h, i, j) \ + I012ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) + +#define I410ToARGB(a, b, c, d, e, f, g, h, i, j) \ + I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) +#define I410ToABGR(a, b, c, d, e, f, g, h, i, j) \ + I410ToABGRMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) +#define H410ToARGB(a, b, c, d, e, f, g, h, i, j) \ + I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvH709Constants, i, j) +#define H410ToABGR(a, b, c, d, e, f, g, h, i, j) \ + I410ToABGRMatrix(a, b, c, d, e, f, g, h, &kYuvH709Constants, i, j) +#define U410ToARGB(a, b, c, d, e, f, g, h, i, j) \ + I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j) +#define U410ToABGR(a, b, c, d, e, f, g, h, i, j) \ + I410ToABGRMatrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j) +#define I410ToAR30(a, b, c, d, e, f, g, h, i, j) \ + I410ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) +#define I410ToAB30(a, b, c, d, e, f, g, h, i, j) \ + I410ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) +#define H410ToAR30(a, b, c, d, e, f, g, h, i, j) \ + I410ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuvH709Constants, i, j) +#define H410ToAB30(a, b, c, d, e, f, g, h, i, j) \ + I410ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuvH709Constants, i, j) +#define U410ToAR30(a, b, c, d, e, f, g, h, i, j) \ + I410ToAR30Matrix(a, b, c, d, e, f, 
g, h, &kYuv2020Constants, i, j) +#define U410ToAB30(a, b, c, d, e, f, g, h, i, j) \ + I410ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j) + // TODO(fbarchard): Fix clamping issue affected by U channel. -#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ - ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF) \ - TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ - const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ - const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ - const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ - const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ - const int kBpc = 2; \ - align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \ - align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \ - align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \ - align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \ - align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \ - for (int i = 0; i < kWidth * kHeight; ++i) { \ - reinterpret_cast<uint16_t*>(src_y + SOFF)[i] = (fastrand() & 0x3ff); \ - } \ - for (int i = 0; i < kSizeUV; ++i) { \ - reinterpret_cast<uint16_t*>(src_u + SOFF)[i] = (fastrand() & 0x3ff); \ - reinterpret_cast<uint16_t*>(src_v + SOFF)[i] = (fastrand() & 0x3ff); \ - } \ - memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \ - memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_PLANAR##To##FMT_B( \ - reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \ - reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \ - reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \ - dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_PLANAR##To##FMT_B( \ - reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \ - reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \ - 
reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \ - dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \ - } \ - for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ - EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]); \ - } \ - free_aligned_buffer_page_end(src_y); \ - free_aligned_buffer_page_end(src_u); \ - free_aligned_buffer_page_end(src_v); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_opt); \ - } - -#define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN) \ - TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_ - 4, _Any, +, 0, 0) \ - TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Unaligned, +, 1, 1) \ - TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Invert, -, 0, 0) \ - TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Opt, +, 0, 0) - -TESTPLANAR16TOB(I010, 2, 2, ARGB, 4, 4, 1) -TESTPLANAR16TOB(I010, 2, 2, ABGR, 4, 4, 1) -TESTPLANAR16TOB(H010, 2, 2, ARGB, 4, 4, 1) -TESTPLANAR16TOB(H010, 2, 2, ABGR, 4, 4, 1) -TESTPLANAR16TOB(U010, 2, 2, ARGB, 4, 4, 1) -TESTPLANAR16TOB(U010, 2, 2, ABGR, 4, 4, 1) -TESTPLANAR16TOB(I210, 2, 1, ARGB, 4, 4, 1) -TESTPLANAR16TOB(I210, 2, 1, ABGR, 4, 4, 1) -TESTPLANAR16TOB(H210, 2, 1, ARGB, 4, 4, 1) -TESTPLANAR16TOB(H210, 2, 1, ABGR, 4, 4, 1) -TESTPLANAR16TOB(U210, 2, 1, ARGB, 4, 4, 1) -TESTPLANAR16TOB(U210, 2, 1, ABGR, 4, 4, 1) +#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, \ + BPP_B, ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF) \ + TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ + const int kWidth = W1280; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ + const int kSizeUV 
= kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ + const int kBpc = 2; \ + align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \ + align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \ + align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \ + align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \ + align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + reinterpret_cast<uint16_t*>(src_y + SOFF)[i] = (fastrand() & FMT_MASK); \ + } \ + for (int i = 0; i < kSizeUV; ++i) { \ + reinterpret_cast<uint16_t*>(src_u + SOFF)[i] = (fastrand() & FMT_MASK); \ + reinterpret_cast<uint16_t*>(src_v + SOFF)[i] = (fastrand() & FMT_MASK); \ + } \ + memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \ + memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_PLANAR##To##FMT_B( \ + reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \ + reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \ + reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \ + dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B( \ + reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \ + reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \ + reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \ + dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \ + } \ + for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ + EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]); \ + } \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_u); \ + free_aligned_buffer_page_end(src_v); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + } + +#define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, \ + BPP_B, ALIGN, YALIGN) \ + TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \ + 
ALIGN, YALIGN, benchmark_width_ + 1, _Any, +, 0, 0) \ + TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \ + ALIGN, YALIGN, benchmark_width_, _Unaligned, +, 1, 1) \ + TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \ + ALIGN, YALIGN, benchmark_width_, _Invert, -, 0, 0) \ + TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \ + ALIGN, YALIGN, benchmark_width_, _Opt, +, 0, 0) + +// These conversions are only optimized for x86 +#if defined(ENABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__) +TESTPLANAR16TOB(I010, 2, 2, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(I010, 2, 2, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(H010, 2, 2, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(H010, 2, 2, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(U010, 2, 2, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(U010, 2, 2, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(I210, 2, 1, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(I210, 2, 1, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(H210, 2, 1, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(H210, 2, 1, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(U210, 2, 1, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(U210, 2, 1, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(I410, 1, 1, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(I410, 1, 1, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(H410, 1, 1, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(H410, 1, 1, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(U410, 1, 1, 0x3ff, ARGB, 4, 4, 1) +TESTPLANAR16TOB(U410, 1, 1, 0x3ff, ABGR, 4, 4, 1) +TESTPLANAR16TOB(I012, 2, 2, 0xfff, ARGB, 4, 4, 1) + #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTPLANAR16TOB(I010, 2, 2, AR30, 4, 4, 1) -TESTPLANAR16TOB(I010, 2, 2, AB30, 4, 4, 1) -TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1) -TESTPLANAR16TOB(H010, 2, 2, AB30, 4, 4, 1) -TESTPLANAR16TOB(U010, 2, 2, AR30, 4, 4, 1) -TESTPLANAR16TOB(U010, 2, 2, AB30, 4, 4, 1) -TESTPLANAR16TOB(I210, 2, 1, AR30, 4, 4, 1) -TESTPLANAR16TOB(I210, 2, 1, AB30, 4, 4, 1) -TESTPLANAR16TOB(H210, 2, 1, AR30, 4, 4, 1) 
-TESTPLANAR16TOB(H210, 2, 1, AB30, 4, 4, 1) -TESTPLANAR16TOB(U210, 2, 1, AR30, 4, 4, 1) -TESTPLANAR16TOB(U210, 2, 1, AB30, 4, 4, 1) -#endif +TESTPLANAR16TOB(I010, 2, 2, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(I010, 2, 2, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(H010, 2, 2, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(H010, 2, 2, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(U010, 2, 2, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(U010, 2, 2, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(H210, 2, 1, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(H210, 2, 1, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(U210, 2, 1, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(U210, 2, 1, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(I410, 1, 1, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(I410, 1, 1, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(H410, 1, 1, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(H410, 1, 1, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(U410, 1, 1, 0x3ff, AR30, 4, 4, 1) +TESTPLANAR16TOB(U410, 1, 1, 0x3ff, AB30, 4, 4, 1) +TESTPLANAR16TOB(I012, 2, 2, 0xfff, AR30, 4, 4, 1) +#endif // LITTLE_ENDIAN_ONLY_TEST +#endif // ENABLE_SLOW_TESTS + +#define TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + ALIGN, YALIGN, W1280, N, NEG, OFF, ATTEN, S_DEPTH) \ + TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ + const int kWidth = W1280; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ + const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ + const int kBpc = 2; \ + align_buffer_page_end(src_y, kWidth* kHeight* kBpc + OFF); \ + align_buffer_page_end(src_u, kSizeUV* kBpc + OFF); \ + align_buffer_page_end(src_v, kSizeUV* kBpc + OFF); \ + align_buffer_page_end(src_a, kWidth* kHeight* kBpc + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \ + align_buffer_page_end(dst_argb_opt, 
kStrideB* kHeight + OFF); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + reinterpret_cast<uint16_t*>(src_y + OFF)[i] = \ + (fastrand() & ((1 << S_DEPTH) - 1)); \ + reinterpret_cast<uint16_t*>(src_a + OFF)[i] = \ + (fastrand() & ((1 << S_DEPTH) - 1)); \ + } \ + for (int i = 0; i < kSizeUV; ++i) { \ + reinterpret_cast<uint16_t*>(src_u + OFF)[i] = \ + (fastrand() & ((1 << S_DEPTH) - 1)); \ + reinterpret_cast<uint16_t*>(src_v + OFF)[i] = \ + (fastrand() & ((1 << S_DEPTH) - 1)); \ + } \ + memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \ + memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16_t*>(src_y + OFF), kWidth, \ + reinterpret_cast<uint16_t*>(src_u + OFF), kStrideUV, \ + reinterpret_cast<uint16_t*>(src_v + OFF), kStrideUV, \ + reinterpret_cast<uint16_t*>(src_a + OFF), kWidth, \ + dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight, \ + ATTEN); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B( \ + reinterpret_cast<uint16_t*>(src_y + OFF), kWidth, \ + reinterpret_cast<uint16_t*>(src_u + OFF), kStrideUV, \ + reinterpret_cast<uint16_t*>(src_v + OFF), kStrideUV, \ + reinterpret_cast<uint16_t*>(src_a + OFF), kWidth, \ + dst_argb_opt + OFF, kStrideB, kWidth, NEG kHeight, ATTEN); \ + } \ + for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ + EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]); \ + } \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_u); \ + free_aligned_buffer_page_end(src_v); \ + free_aligned_buffer_page_end(src_a); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + } + +#define TESTQPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + ALIGN, YALIGN, S_DEPTH) \ + TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_ + 1, _Any, +, 0, 0, S_DEPTH) \ + 
TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Unaligned, +, 1, 0, S_DEPTH) \ + TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Invert, -, 0, 0, S_DEPTH) \ + TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Opt, +, 0, 0, S_DEPTH) \ + TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Premult, +, 0, 1, S_DEPTH) + +#define I010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \ + l, m) +#define I010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \ + l, m) +#define J010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define J010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define F010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ + l, m) +#define F010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ + l, m) +#define H010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define H010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define U010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define U010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + 
I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define V010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ + l, m) +#define V010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ + l, m) +#define I210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \ + l, m) +#define I210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \ + l, m) +#define J210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define J210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define F210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ + l, m) +#define F210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ + l, m) +#define H210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define H210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define U210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define U210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define V210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + 
I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ + l, m) +#define V210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ + l, m) +#define I410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \ + l, m) +#define I410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \ + l, m) +#define J410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define J410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define F410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ + l, m) +#define F410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ + l, m) +#define H410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define H410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define U410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define U410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define V410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ + l, m) +#define V410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + 
I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ + l, m) + +// These conversions are only optimized for x86 +#if defined(ENABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__) +TESTQPLANAR16TOB(I010Alpha, 2, 2, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(I010Alpha, 2, 2, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(J010Alpha, 2, 2, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(J010Alpha, 2, 2, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(H010Alpha, 2, 2, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(H010Alpha, 2, 2, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(F010Alpha, 2, 2, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(F010Alpha, 2, 2, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(U010Alpha, 2, 2, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(U010Alpha, 2, 2, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(V010Alpha, 2, 2, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(V010Alpha, 2, 2, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(I210Alpha, 2, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(J210Alpha, 2, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(J210Alpha, 2, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(H210Alpha, 2, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(H210Alpha, 2, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(F210Alpha, 2, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(F210Alpha, 2, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(U210Alpha, 2, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(U210Alpha, 2, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(V210Alpha, 2, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(V210Alpha, 2, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(I410Alpha, 1, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(I410Alpha, 1, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(J410Alpha, 1, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(J410Alpha, 1, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(H410Alpha, 1, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(H410Alpha, 1, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(F410Alpha, 1, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(F410Alpha, 1, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(U410Alpha, 1, 1, ARGB, 4, 4, 1, 10) 
+TESTQPLANAR16TOB(U410Alpha, 1, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(V410Alpha, 1, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(V410Alpha, 1, 1, ABGR, 4, 4, 1, 10) +#endif // ENABLE_SLOW_TESTS + +#define TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF, S_DEPTH) \ + TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ + const int kWidth = W1280; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X) * 2; \ + const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; \ + const int kBpc = 2; \ + align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \ + align_buffer_page_end(src_uv, kSizeUV* kBpc + SOFF); \ + align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \ + align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + reinterpret_cast<uint16_t*>(src_y + SOFF)[i] = \ + (fastrand() & (((uint16_t)(-1)) << (16 - S_DEPTH))); \ + } \ + for (int i = 0; i < kSizeUV; ++i) { \ + reinterpret_cast<uint16_t*>(src_uv + SOFF)[i] = \ + (fastrand() & (((uint16_t)(-1)) << (16 - S_DEPTH))); \ + } \ + memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \ + memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \ + reinterpret_cast<uint16_t*>(src_uv + SOFF), \ + kStrideUV, dst_argb_c + DOFF, kStrideB, kWidth, \ + NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \ + reinterpret_cast<uint16_t*>(src_uv + SOFF), \ + kStrideUV, dst_argb_opt + DOFF, kStrideB, kWidth, \ + NEG kHeight); \ + } \ + for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ + EXPECT_EQ(dst_argb_c[i + DOFF], 
dst_argb_opt[i + DOFF]); \ + } \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_uv); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + } + +#define TESTBIPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + ALIGN, YALIGN, S_DEPTH) \ + TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_ + 1, _Any, +, 0, 0, S_DEPTH) \ + TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Unaligned, +, 1, 1, S_DEPTH) \ + TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Invert, -, 0, 0, S_DEPTH) \ + TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Opt, +, 0, 0, S_DEPTH) + +#define P010ToARGB(a, b, c, d, e, f, g, h) \ + P010ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P210ToARGB(a, b, c, d, e, f, g, h) \ + P210ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P010ToAR30(a, b, c, d, e, f, g, h) \ + P010ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P210ToAR30(a, b, c, d, e, f, g, h) \ + P210ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) + +#define P012ToARGB(a, b, c, d, e, f, g, h) \ + P012ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P212ToARGB(a, b, c, d, e, f, g, h) \ + P212ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P012ToAR30(a, b, c, d, e, f, g, h) \ + P012ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P212ToAR30(a, b, c, d, e, f, g, h) \ + P212ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) + +#define P016ToARGB(a, b, c, d, e, f, g, h) \ + P016ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P216ToARGB(a, b, c, d, e, f, g, h) \ + P216ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P016ToAR30(a, b, c, d, e, f, g, h) 
\ + P016ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P216ToAR30(a, b, c, d, e, f, g, h) \ + P216ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) + +#if defined(ENABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__) +TESTBIPLANAR16TOB(P010, 2, 2, ARGB, 4, 4, 1, 10) +TESTBIPLANAR16TOB(P210, 2, 1, ARGB, 4, 4, 1, 10) +TESTBIPLANAR16TOB(P012, 2, 2, ARGB, 4, 4, 1, 12) +TESTBIPLANAR16TOB(P212, 2, 1, ARGB, 4, 4, 1, 12) +TESTBIPLANAR16TOB(P016, 2, 2, ARGB, 4, 4, 1, 16) +TESTBIPLANAR16TOB(P216, 2, 1, ARGB, 4, 4, 1, 16) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTBIPLANAR16TOB(P010, 2, 2, AR30, 4, 4, 1, 10) +TESTBIPLANAR16TOB(P210, 2, 1, AR30, 4, 4, 1, 10) +TESTBIPLANAR16TOB(P012, 2, 2, AR30, 4, 4, 1, 12) +TESTBIPLANAR16TOB(P212, 2, 1, AR30, 4, 4, 1, 12) +TESTBIPLANAR16TOB(P016, 2, 2, AR30, 4, 4, 1, 16) +TESTBIPLANAR16TOB(P216, 2, 1, AR30, 4, 4, 1, 16) +#endif // LITTLE_ENDIAN_ONLY_TEST +#endif // defined(ENABLE_SLOW_TESTS) static int Clamp(int y) { if (y < 0) { diff --git a/third_party/libyuv/unit_test/cpu_test.cc b/third_party/libyuv/unit_test/cpu_test.cc index 7264de0801..4035cf2bbc 100644 --- a/third_party/libyuv/unit_test/cpu_test.cc +++ b/third_party/libyuv/unit_test/cpu_test.cc @@ -72,26 +72,98 @@ TEST_F(LibYUVBaseTest, TestCpuHas) { #endif } -TEST_F(LibYUVBaseTest, TestCpuCompilerEnabled) { -#if defined(__aarch64__) - printf("Arm64 build\n"); +TEST_F(LibYUVBaseTest, TestCompilerMacros) { + // Tests all macros used in public headers. 
+#ifdef __ATOMIC_RELAXED + printf("__ATOMIC_RELAXED %d\n", __ATOMIC_RELAXED); #endif -#if defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON) - printf("Neon build enabled\n"); +#ifdef __cplusplus + printf("__cplusplus %ld\n", __cplusplus); #endif -#if defined(__x86_64__) || defined(_M_X64) - printf("x64 build\n"); +#ifdef __clang_major__ + printf("__clang_major__ %d\n", __clang_major__); +#endif +#ifdef __clang_minor__ + printf("__clang_minor__ %d\n", __clang_minor__); +#endif +#ifdef __GNUC__ + printf("__GNUC__ %d\n", __GNUC__); +#endif +#ifdef __GNUC_MINOR__ + printf("__GNUC_MINOR__ %d\n", __GNUC_MINOR__); +#endif +#ifdef __i386__ + printf("__i386__ %d\n", __i386__); +#endif +#ifdef __mips + printf("__mips %d\n", __mips); +#endif +#ifdef __mips_isa_rev + printf("__mips_isa_rev %d\n", __mips_isa_rev); +#endif +#ifdef __x86_64__ + printf("__x86_64__ %d\n", __x86_64__); #endif #ifdef _MSC_VER printf("_MSC_VER %d\n", _MSC_VER); #endif -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(GCC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ - defined(VISUALC_HAS_AVX2)) - printf("Has AVX2 1\n"); -#else - printf("Has AVX2 0\n"); -// If compiler does not support AVX2, the following function not expected: +#ifdef __aarch64__ + printf("__aarch64__ %d\n", __aarch64__); +#endif +#ifdef __APPLE__ + printf("__APPLE__ %d\n", __APPLE__); +#endif +#ifdef __arm__ + printf("__arm__ %d\n", __arm__); +#endif +#ifdef __clang__ + printf("__clang__ %d\n", __clang__); +#endif +#ifdef __CLR_VER + printf("__CLR_VER %d\n", __CLR_VER); +#endif +#ifdef __CYGWIN__ + printf("__CYGWIN__ %d\n", __CYGWIN__); +#endif +#ifdef __llvm__ + printf("__llvm__ %d\n", __llvm__); +#endif +#ifdef __mips_msa + printf("__mips_msa %d\n", __mips_msa); +#endif +#ifdef __native_client__ + printf("__native_client__ %d\n", __native_client__); +#endif +#ifdef __pic__ + printf("__pic__ %d\n", __pic__); +#endif +#ifdef __pnacl__ + printf("__pnacl__ %d\n", __pnacl__); +#endif +#ifdef _M_IX86 + printf("_M_IX86 
%d\n", _M_IX86); +#endif +#ifdef _M_X64 + printf("_M_X64 %d\n", _M_X64); +#endif +#ifdef _MIPS_ARCH_LOONGSON3A + printf("_MIPS_ARCH_LOONGSON3A %d\n", _MIPS_ARCH_LOONGSON3A); +#endif +#ifdef _WIN32 + printf("_WIN32 %d\n", _WIN32); +#endif +#ifdef GG_LONGLONG + printf("GG_LONGLONG %d\n", GG_LONGLONG); +#endif +#ifdef INT_TYPES_DEFINED + printf("INT_TYPES_DEFINED\n"); +#endif +#ifdef __has_feature + printf("__has_feature\n"); +#if __has_feature(memory_sanitizer) + printf("__has_feature(memory_sanitizer) %d\n", + __has_feature(memory_sanitizer)); +#endif #endif } diff --git a/third_party/libyuv/unit_test/planar_test.cc b/third_party/libyuv/unit_test/planar_test.cc index fd1755cdca..5c60842136 100644 --- a/third_party/libyuv/unit_test/planar_test.cc +++ b/third_party/libyuv/unit_test/planar_test.cc @@ -155,7 +155,7 @@ static int TestAttenuateI(int width, } TEST_F(LibYUVPlanarTest, ARGBAttenuate_Any) { - int max_diff = TestAttenuateI(benchmark_width_ - 1, benchmark_height_, + int max_diff = TestAttenuateI(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 2); @@ -228,7 +228,7 @@ static int TestUnattenuateI(int width, } TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Any) { - int max_diff = TestUnattenuateI(benchmark_width_ - 1, benchmark_height_, + int max_diff = TestUnattenuateI(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 2); @@ -1076,7 +1076,7 @@ TEST_F(LibYUVPlanarTest, TestInterpolatePlane) { #define TESTTERP(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, STRIDE_B, W1280, TERP, \ N, NEG, OFF) \ TEST_F(LibYUVPlanarTest, ARGBInterpolate##TERP##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kStrideA = \ (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ @@ -1108,7 +1108,7 @@ TEST_F(LibYUVPlanarTest, TestInterpolatePlane) { } #define TESTINTERPOLATE(TERP) \ - TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_ - 1, TERP, _Any, +, 0) \ + TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_ + 1, TERP, _Any, +, 0) \ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Unaligned, +, 1) \ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Invert, -, 0) \ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Opt, +, 0) @@ -1174,7 +1174,7 @@ static int TestBlend(int width, TEST_F(LibYUVPlanarTest, ARGBBlend_Any) { int max_diff = - TestBlend(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_, + TestBlend(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1); EXPECT_LE(max_diff, 1); } @@ -1280,7 +1280,7 @@ TEST_F(LibYUVPlanarTest, BlendPlane_Unaligned) { disable_cpu_flags_, benchmark_cpu_info_, +1, 1); } TEST_F(LibYUVPlanarTest, BlendPlane_Any) { - TestBlendPlane(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_, + TestBlendPlane(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1); } TEST_F(LibYUVPlanarTest, BlendPlane_Invert) { @@ -1375,7 +1375,7 @@ TEST_F(LibYUVPlanarTest, I420Blend_Unaligned) { // TODO(fbarchard): DISABLED because _Any uses C. Avoid C and re-enable. 
TEST_F(LibYUVPlanarTest, DISABLED_I420Blend_Any) { - TestI420Blend(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_, + TestI420Blend(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); } TEST_F(LibYUVPlanarTest, I420Blend_Invert) { @@ -1524,7 +1524,7 @@ static int TestMultiply(int width, } TEST_F(LibYUVPlanarTest, ARGBMultiply_Any) { - int max_diff = TestMultiply(benchmark_width_ - 1, benchmark_height_, + int max_diff = TestMultiply(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 1); @@ -1599,7 +1599,7 @@ static int TestAdd(int width, TEST_F(LibYUVPlanarTest, ARGBAdd_Any) { int max_diff = - TestAdd(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_, + TestAdd(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 1); } @@ -1672,7 +1672,7 @@ static int TestSubtract(int width, } TEST_F(LibYUVPlanarTest, ARGBSubtract_Any) { - int max_diff = TestSubtract(benchmark_width_ - 1, benchmark_height_, + int max_diff = TestSubtract(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 1); @@ -1745,7 +1745,7 @@ static int TestSobel(int width, TEST_F(LibYUVPlanarTest, ARGBSobel_Any) { int max_diff = - TestSobel(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_, + TestSobel(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_EQ(0, max_diff); } @@ -1818,7 +1818,7 @@ static int TestSobelToPlane(int width, } TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Any) { - int max_diff = TestSobelToPlane(benchmark_width_ - 1, benchmark_height_, + int max_diff = TestSobelToPlane(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); 
EXPECT_EQ(0, max_diff); @@ -1890,7 +1890,7 @@ static int TestSobelXY(int width, } TEST_F(LibYUVPlanarTest, ARGBSobelXY_Any) { - int max_diff = TestSobelXY(benchmark_width_ - 1, benchmark_height_, + int max_diff = TestSobelXY(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_EQ(0, max_diff); @@ -1966,29 +1966,35 @@ static int TestBlur(int width, return max_diff; } +#if defined(ENABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__) +#define DISABLED_ARM(name) name +#else +#define DISABLED_ARM(name) DISABLED_##name +#endif + static const int kBlurSize = 55; -TEST_F(LibYUVPlanarTest, ARGBBlur_Any) { +TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlur_Any)) { int max_diff = - TestBlur(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_, + TestBlur(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSize); EXPECT_LE(max_diff, 1); } -TEST_F(LibYUVPlanarTest, ARGBBlur_Unaligned) { +TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlur_Unaligned)) { int max_diff = TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1, kBlurSize); EXPECT_LE(max_diff, 1); } -TEST_F(LibYUVPlanarTest, ARGBBlur_Invert) { +TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlur_Invert)) { int max_diff = TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0, kBlurSize); EXPECT_LE(max_diff, 1); } -TEST_F(LibYUVPlanarTest, ARGBBlur_Opt) { +TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlur_Opt)) { int max_diff = TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSize); @@ -1996,35 +2002,35 @@ TEST_F(LibYUVPlanarTest, ARGBBlur_Opt) { } static const int kBlurSmallSize = 5; -TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Any) { +TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlurSmall_Any)) { int 
max_diff = - TestBlur(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_, + TestBlur(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSmallSize); EXPECT_LE(max_diff, 1); } -TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Unaligned) { +TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlurSmall_Unaligned)) { int max_diff = TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1, kBlurSmallSize); EXPECT_LE(max_diff, 1); } -TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Invert) { +TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlurSmall_Invert)) { int max_diff = TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0, kBlurSmallSize); EXPECT_LE(max_diff, 1); } -TEST_F(LibYUVPlanarTest, ARGBBlurSmall_Opt) { +TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlurSmall_Opt)) { int max_diff = TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSmallSize); EXPECT_LE(max_diff, 1); } -TEST_F(LibYUVPlanarTest, TestARGBPolynomial) { +TEST_F(LibYUVPlanarTest, DISABLED_ARM(TestARGBPolynomial)) { SIMD_ALIGNED(uint8_t orig_pixels[1280][4]); SIMD_ALIGNED(uint8_t dst_pixels_opt[1280][4]); SIMD_ALIGNED(uint8_t dst_pixels_c[1280][4]); @@ -2398,8 +2404,7 @@ TEST_F(LibYUVPlanarTest, TestARGBCopyAlpha) { } TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 4); align_buffer_page_end(dst_pixels_opt, kPixels); align_buffer_page_end(dst_pixels_c, kPixels); @@ -2427,8 +2432,7 @@ TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) { } TEST_F(LibYUVPlanarTest, TestARGBCopyYToAlpha) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * 
benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(orig_pixels, kPixels); align_buffer_page_end(dst_pixels_opt, kPixels * 4); align_buffer_page_end(dst_pixels_c, kPixels * 4); @@ -2505,7 +2509,7 @@ static int TestARGBRect(int width, } TEST_F(LibYUVPlanarTest, ARGBRect_Any) { - int max_diff = TestARGBRect(benchmark_width_ - 1, benchmark_height_, + int max_diff = TestARGBRect(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 4); EXPECT_EQ(0, max_diff); @@ -2533,7 +2537,7 @@ TEST_F(LibYUVPlanarTest, ARGBRect_Opt) { } TEST_F(LibYUVPlanarTest, SetPlane_Any) { - int max_diff = TestARGBRect(benchmark_width_ - 1, benchmark_height_, + int max_diff = TestARGBRect(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1); EXPECT_EQ(0, max_diff); @@ -2561,35 +2565,25 @@ TEST_F(LibYUVPlanarTest, SetPlane_Opt) { } TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; - align_buffer_page_end(src_pixels, kPixels * 2); - align_buffer_page_end(tmp_pixels_u, kPixels); - align_buffer_page_end(tmp_pixels_v, kPixels); + const int kPixels = benchmark_width_ * benchmark_height_; + align_buffer_page_end(src_pixels_u, kPixels); + align_buffer_page_end(src_pixels_v, kPixels); align_buffer_page_end(dst_pixels_opt, kPixels * 2); align_buffer_page_end(dst_pixels_c, kPixels * 2); - MemRandomize(src_pixels, kPixels * 2); - MemRandomize(tmp_pixels_u, kPixels); - MemRandomize(tmp_pixels_v, kPixels); + MemRandomize(src_pixels_u, kPixels); + MemRandomize(src_pixels_v, kPixels); MemRandomize(dst_pixels_opt, kPixels * 2); MemRandomize(dst_pixels_c, kPixels * 2); MaskCpuFlags(disable_cpu_flags_); - SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_, - tmp_pixels_v, benchmark_width_, 
benchmark_width_, - benchmark_height_); - MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_, + MergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v, benchmark_width_, dst_pixels_c, benchmark_width_ * 2, benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); - SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_, - tmp_pixels_v, benchmark_width_, benchmark_width_, - benchmark_height_); - for (int i = 0; i < benchmark_iterations_; ++i) { - MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_, + MergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v, benchmark_width_, dst_pixels_opt, benchmark_width_ * 2, benchmark_width_, benchmark_height_); } @@ -2598,60 +2592,127 @@ TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } - free_aligned_buffer_page_end(src_pixels); - free_aligned_buffer_page_end(tmp_pixels_u); - free_aligned_buffer_page_end(tmp_pixels_v); + free_aligned_buffer_page_end(src_pixels_u); + free_aligned_buffer_page_end(src_pixels_v); + free_aligned_buffer_page_end(dst_pixels_opt); + free_aligned_buffer_page_end(dst_pixels_c); +} + +// 16 bit channel split and merge +TEST_F(LibYUVPlanarTest, MergeUVPlane_16_Opt) { + const int kPixels = benchmark_width_ * benchmark_height_; + align_buffer_page_end(src_pixels_u, kPixels * 2); + align_buffer_page_end(src_pixels_v, kPixels * 2); + align_buffer_page_end(dst_pixels_opt, kPixels * 2 * 2); + align_buffer_page_end(dst_pixels_c, kPixels * 2 * 2); + MemRandomize(src_pixels_u, kPixels * 2); + MemRandomize(src_pixels_v, kPixels * 2); + MemRandomize(dst_pixels_opt, kPixels * 2 * 2); + MemRandomize(dst_pixels_c, kPixels * 2 * 2); + + MaskCpuFlags(disable_cpu_flags_); + MergeUVPlane_16((const uint16_t*)src_pixels_u, benchmark_width_, + (const uint16_t*)src_pixels_v, benchmark_width_, + (uint16_t*)dst_pixels_c, benchmark_width_ * 2, + benchmark_width_, benchmark_height_, 12); + 
MaskCpuFlags(benchmark_cpu_info_); + + for (int i = 0; i < benchmark_iterations_; ++i) { + MergeUVPlane_16((const uint16_t*)src_pixels_u, benchmark_width_, + (const uint16_t*)src_pixels_v, benchmark_width_, + (uint16_t*)dst_pixels_opt, benchmark_width_ * 2, + benchmark_width_, benchmark_height_, 12); + } + + for (int i = 0; i < kPixels * 2 * 2; ++i) { + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + } + free_aligned_buffer_page_end(src_pixels_u); + free_aligned_buffer_page_end(src_pixels_v); free_aligned_buffer_page_end(dst_pixels_opt); free_aligned_buffer_page_end(dst_pixels_c); } TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 2); - align_buffer_page_end(tmp_pixels_u, kPixels); - align_buffer_page_end(tmp_pixels_v, kPixels); - align_buffer_page_end(dst_pixels_opt, kPixels * 2); - align_buffer_page_end(dst_pixels_c, kPixels * 2); + align_buffer_page_end(dst_pixels_u_c, kPixels); + align_buffer_page_end(dst_pixels_v_c, kPixels); + align_buffer_page_end(dst_pixels_u_opt, kPixels); + align_buffer_page_end(dst_pixels_v_opt, kPixels); MemRandomize(src_pixels, kPixels * 2); - MemRandomize(tmp_pixels_u, kPixels); - MemRandomize(tmp_pixels_v, kPixels); - MemRandomize(dst_pixels_opt, kPixels * 2); - MemRandomize(dst_pixels_c, kPixels * 2); + MemRandomize(dst_pixels_u_c, kPixels); + MemRandomize(dst_pixels_v_c, kPixels); + MemRandomize(dst_pixels_u_opt, kPixels); + MemRandomize(dst_pixels_v_opt, kPixels); MaskCpuFlags(disable_cpu_flags_); - SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_, - tmp_pixels_v, benchmark_width_, benchmark_width_, - benchmark_height_); - MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_, - dst_pixels_c, benchmark_width_ * 2, benchmark_width_, - benchmark_height_); + 
SplitUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_u_c, + benchmark_width_, dst_pixels_v_c, benchmark_width_, + benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { - SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, - benchmark_width_, tmp_pixels_v, benchmark_width_, + SplitUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_u_opt, + benchmark_width_, dst_pixels_v_opt, benchmark_width_, benchmark_width_, benchmark_height_); } - MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_, - dst_pixels_opt, benchmark_width_ * 2, benchmark_width_, - benchmark_height_); - for (int i = 0; i < kPixels * 2; ++i) { - EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + for (int i = 0; i < kPixels; ++i) { + EXPECT_EQ(dst_pixels_u_c[i], dst_pixels_u_opt[i]); + EXPECT_EQ(dst_pixels_v_c[i], dst_pixels_v_opt[i]); } free_aligned_buffer_page_end(src_pixels); - free_aligned_buffer_page_end(tmp_pixels_u); - free_aligned_buffer_page_end(tmp_pixels_v); - free_aligned_buffer_page_end(dst_pixels_opt); - free_aligned_buffer_page_end(dst_pixels_c); + free_aligned_buffer_page_end(dst_pixels_u_c); + free_aligned_buffer_page_end(dst_pixels_v_c); + free_aligned_buffer_page_end(dst_pixels_u_opt); + free_aligned_buffer_page_end(dst_pixels_v_opt); +} + +// 16 bit channel split +TEST_F(LibYUVPlanarTest, SplitUVPlane_16_Opt) { + const int kPixels = benchmark_width_ * benchmark_height_; + align_buffer_page_end(src_pixels, kPixels * 2 * 2); + align_buffer_page_end(dst_pixels_u_c, kPixels * 2); + align_buffer_page_end(dst_pixels_v_c, kPixels * 2); + align_buffer_page_end(dst_pixels_u_opt, kPixels * 2); + align_buffer_page_end(dst_pixels_v_opt, kPixels * 2); + MemRandomize(src_pixels, kPixels * 2 * 2); + MemRandomize(dst_pixels_u_c, kPixels * 2); + MemRandomize(dst_pixels_v_c, kPixels * 2); + MemRandomize(dst_pixels_u_opt, kPixels * 2); + MemRandomize(dst_pixels_v_opt, kPixels * 2); + + 
MaskCpuFlags(disable_cpu_flags_); + SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2, + (uint16_t*)dst_pixels_u_c, benchmark_width_, + (uint16_t*)dst_pixels_v_c, benchmark_width_, benchmark_width_, + benchmark_height_, 10); + MaskCpuFlags(benchmark_cpu_info_); + + for (int i = 0; i < benchmark_iterations_; ++i) { + SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2, + (uint16_t*)dst_pixels_u_opt, benchmark_width_, + (uint16_t*)dst_pixels_v_opt, benchmark_width_, + benchmark_width_, benchmark_height_, 10); + } + + for (int i = 0; i < kPixels * 2; ++i) { + EXPECT_EQ(dst_pixels_u_c[i], dst_pixels_u_opt[i]); + EXPECT_EQ(dst_pixels_v_c[i], dst_pixels_v_opt[i]); + } + free_aligned_buffer_page_end(src_pixels); + free_aligned_buffer_page_end(dst_pixels_u_c); + free_aligned_buffer_page_end(dst_pixels_v_c); + free_aligned_buffer_page_end(dst_pixels_u_opt); + free_aligned_buffer_page_end(dst_pixels_v_opt); } TEST_F(LibYUVPlanarTest, SwapUVPlane_Opt) { // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 2); align_buffer_page_end(dst_pixels_opt, kPixels * 2); align_buffer_page_end(dst_pixels_c, kPixels * 2); @@ -2681,7 +2742,7 @@ TEST_F(LibYUVPlanarTest, SwapUVPlane_Opt) { TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) { // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 3); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); @@ -2730,7 +2791,7 @@ TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) { TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) { // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * 
benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 3); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); @@ -2777,8 +2838,7 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) { } TEST_F(LibYUVPlanarTest, MergeARGBPlane_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 4); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); @@ -2832,8 +2892,7 @@ TEST_F(LibYUVPlanarTest, MergeARGBPlane_Opt) { } TEST_F(LibYUVPlanarTest, SplitARGBPlane_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 4); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); @@ -2887,8 +2946,7 @@ TEST_F(LibYUVPlanarTest, SplitARGBPlane_Opt) { } TEST_F(LibYUVPlanarTest, MergeXRGBPlane_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 4); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); @@ -2938,8 +2996,7 @@ TEST_F(LibYUVPlanarTest, MergeXRGBPlane_Opt) { } TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 4); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); @@ -2987,11 +3044,166 @@ TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) { free_aligned_buffer_page_end(dst_pixels_c); } +// 
Merge 4 channels +#define TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \ + TEST_F(LibYUVPlanarTest, FUNC##Plane_##DEPTH##N) { \ + const int kWidth = W1280; \ + const int kPixels = kWidth * benchmark_height_; \ + align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(src_memory_a, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \ + align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \ + MemRandomize(src_memory_r, kPixels * sizeof(STYPE) + OFF); \ + MemRandomize(src_memory_g, kPixels * sizeof(STYPE) + OFF); \ + MemRandomize(src_memory_b, kPixels * sizeof(STYPE) + OFF); \ + MemRandomize(src_memory_a, kPixels * sizeof(STYPE) + OFF); \ + memset(dst_memory_c, 0, kPixels * 4 * sizeof(DTYPE)); \ + memset(dst_memory_opt, 0, kPixels * 4 * sizeof(DTYPE)); \ + STYPE* src_pixels_r = reinterpret_cast<STYPE*>(src_memory_r + OFF); \ + STYPE* src_pixels_g = reinterpret_cast<STYPE*>(src_memory_g + OFF); \ + STYPE* src_pixels_b = reinterpret_cast<STYPE*>(src_memory_b + OFF); \ + STYPE* src_pixels_a = reinterpret_cast<STYPE*>(src_memory_a + OFF); \ + DTYPE* dst_pixels_c = reinterpret_cast<DTYPE*>(dst_memory_c); \ + DTYPE* dst_pixels_opt = reinterpret_cast<DTYPE*>(dst_memory_opt); \ + MaskCpuFlags(disable_cpu_flags_); \ + FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \ + kWidth, src_pixels_a, kWidth, dst_pixels_c, kWidth * 4, \ + kWidth, NEG benchmark_height_, DEPTH); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \ + kWidth, src_pixels_a, kWidth, dst_pixels_opt, kWidth * 4, \ + kWidth, NEG benchmark_height_, DEPTH); \ + } \ + for (int i = 0; i < kPixels * 4; ++i) { \ + 
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \ + } \ + free_aligned_buffer_page_end(src_memory_r); \ + free_aligned_buffer_page_end(src_memory_g); \ + free_aligned_buffer_page_end(src_memory_b); \ + free_aligned_buffer_page_end(src_memory_a); \ + free_aligned_buffer_page_end(dst_memory_c); \ + free_aligned_buffer_page_end(dst_memory_opt); \ + } + +// Merge 3 channel RGB into 4 channel XRGB with opaque alpha +#define TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \ + TEST_F(LibYUVPlanarTest, FUNC##Plane_Opaque_##DEPTH##N) { \ + const int kWidth = W1280; \ + const int kPixels = kWidth * benchmark_height_; \ + align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \ + align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \ + MemRandomize(src_memory_r, kPixels * sizeof(STYPE) + OFF); \ + MemRandomize(src_memory_g, kPixels * sizeof(STYPE) + OFF); \ + MemRandomize(src_memory_b, kPixels * sizeof(STYPE) + OFF); \ + memset(dst_memory_c, 0, kPixels * 4 * sizeof(DTYPE)); \ + memset(dst_memory_opt, 0, kPixels * 4 * sizeof(DTYPE)); \ + STYPE* src_pixels_r = reinterpret_cast<STYPE*>(src_memory_r + OFF); \ + STYPE* src_pixels_g = reinterpret_cast<STYPE*>(src_memory_g + OFF); \ + STYPE* src_pixels_b = reinterpret_cast<STYPE*>(src_memory_b + OFF); \ + DTYPE* dst_pixels_c = reinterpret_cast<DTYPE*>(dst_memory_c); \ + DTYPE* dst_pixels_opt = reinterpret_cast<DTYPE*>(dst_memory_opt); \ + MaskCpuFlags(disable_cpu_flags_); \ + FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \ + kWidth, NULL, 0, dst_pixels_c, kWidth * 4, kWidth, \ + NEG benchmark_height_, DEPTH); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, 
kWidth, src_pixels_b, \ + kWidth, NULL, 0, dst_pixels_opt, kWidth * 4, kWidth, \ + NEG benchmark_height_, DEPTH); \ + } \ + for (int i = 0; i < kPixels * 4; ++i) { \ + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \ + } \ + free_aligned_buffer_page_end(src_memory_r); \ + free_aligned_buffer_page_end(src_memory_g); \ + free_aligned_buffer_page_end(src_memory_b); \ + free_aligned_buffer_page_end(dst_memory_c); \ + free_aligned_buffer_page_end(dst_memory_opt); \ + } + +#define TESTQPLANARTOP(FUNC, STYPE, DTYPE, DEPTH) \ + TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ + 1, _Any, +, 0) \ + TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \ + 1) \ + TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0) \ + TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0) \ + TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ + 1, _Any, +, \ + 0) \ + TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \ + 1) \ + TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0) \ + TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0) + +TESTQPLANARTOP(MergeAR64, uint16_t, uint16_t, 10) +TESTQPLANARTOP(MergeAR64, uint16_t, uint16_t, 12) +TESTQPLANARTOP(MergeAR64, uint16_t, uint16_t, 16) +TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 10) +TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 12) +TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 16) + +#define TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \ + TEST_F(LibYUVPlanarTest, FUNC##Plane_##DEPTH##N) { \ + const int kWidth = W1280; \ + const int kPixels = kWidth * benchmark_height_; \ + align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \ 
+ align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \ + MemRandomize(src_memory_r, kPixels * sizeof(STYPE) + OFF); \ + MemRandomize(src_memory_g, kPixels * sizeof(STYPE) + OFF); \ + MemRandomize(src_memory_b, kPixels * sizeof(STYPE) + OFF); \ + STYPE* src_pixels_r = reinterpret_cast<STYPE*>(src_memory_r + OFF); \ + STYPE* src_pixels_g = reinterpret_cast<STYPE*>(src_memory_g + OFF); \ + STYPE* src_pixels_b = reinterpret_cast<STYPE*>(src_memory_b + OFF); \ + DTYPE* dst_pixels_c = reinterpret_cast<DTYPE*>(dst_memory_c); \ + DTYPE* dst_pixels_opt = reinterpret_cast<DTYPE*>(dst_memory_opt); \ + memset(dst_pixels_c, 1, kPixels * 4 * sizeof(DTYPE)); \ + memset(dst_pixels_opt, 2, kPixels * 4 * sizeof(DTYPE)); \ + MaskCpuFlags(disable_cpu_flags_); \ + FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \ + kWidth, dst_pixels_c, kWidth * 4, kWidth, \ + NEG benchmark_height_, DEPTH); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \ + kWidth, dst_pixels_opt, kWidth * 4, kWidth, \ + NEG benchmark_height_, DEPTH); \ + } \ + for (int i = 0; i < kPixels * 4; ++i) { \ + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \ + } \ + free_aligned_buffer_page_end(src_memory_r); \ + free_aligned_buffer_page_end(src_memory_g); \ + free_aligned_buffer_page_end(src_memory_b); \ + free_aligned_buffer_page_end(dst_memory_c); \ + free_aligned_buffer_page_end(dst_memory_opt); \ + } + +#define TESTTPLANARTOP(FUNC, STYPE, DTYPE, DEPTH) \ + TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ + 1, _Any, +, 0) \ + TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \ + 1) \ + TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0) \ + TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0) + +TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 10) +TESTTPLANARTOP(MergeXR30, uint16_t, 
uint8_t, 12) +TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 16) + // TODO(fbarchard): improve test for platforms and cpu detect #ifdef HAS_MERGEUVROW_16_AVX2 TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) { // Round count up to multiple of 16 const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + align_buffer_page_end(src_pixels_u, kPixels * 2); align_buffer_page_end(src_pixels_v, kPixels * 2); align_buffer_page_end(dst_pixels_uv_opt, kPixels * 2 * 2); @@ -3035,8 +3247,9 @@ TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) { // TODO(fbarchard): Improve test for more platforms. #ifdef HAS_MULTIPLYROW_16_AVX2 TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + // Round count up to multiple of 32 + const int kPixels = (benchmark_width_ * benchmark_height_ + 31) & ~31; + align_buffer_page_end(src_pixels_y, kPixels * 2); align_buffer_page_end(dst_pixels_y_opt, kPixels * 2); align_buffer_page_end(dst_pixels_y_c, kPixels * 2); @@ -3072,8 +3285,7 @@ TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) { #endif // HAS_MULTIPLYROW_16_AVX2 TEST_F(LibYUVPlanarTest, Convert16To8Plane) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels_y, kPixels * 2); align_buffer_page_end(dst_pixels_y_opt, kPixels); align_buffer_page_end(dst_pixels_y_c, kPixels); @@ -3152,8 +3364,7 @@ TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) { #endif // ENABLE_ROW_TESTS TEST_F(LibYUVPlanarTest, Convert8To16Plane) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels_y, kPixels); align_buffer_page_end(dst_pixels_y_opt, kPixels * 2); align_buffer_page_end(dst_pixels_y_c, kPixels * 2); diff --git 
a/third_party/libyuv/unit_test/rotate_argb_test.cc b/third_party/libyuv/unit_test/rotate_argb_test.cc index 3208b66a2a..01ed69ca55 100644 --- a/third_party/libyuv/unit_test/rotate_argb_test.cc +++ b/third_party/libyuv/unit_test/rotate_argb_test.cc @@ -156,29 +156,29 @@ TEST_F(LibYUVRotateTest, RotatePlane270_Opt) { } TEST_F(LibYUVRotateTest, DISABLED_RotatePlane0_Odd) { - TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_width_ - 3, benchmark_height_ - 1, kRotate0, + TestRotatePlane(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_width_ + 1, benchmark_height_ + 1, kRotate0, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_RotatePlane90_Odd) { - TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_height_ - 1, benchmark_width_ - 3, kRotate90, + TestRotatePlane(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_height_ + 1, benchmark_width_ + 1, kRotate90, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_RotatePlane180_Odd) { - TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_width_ - 3, benchmark_height_ - 1, kRotate180, + TestRotatePlane(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_width_ + 1, benchmark_height_ + 1, kRotate180, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_RotatePlane270_Odd) { - TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_height_ - 1, benchmark_width_ - 3, kRotate270, + TestRotatePlane(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_height_ + 1, benchmark_width_ + 1, kRotate270, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } diff --git a/third_party/libyuv/unit_test/rotate_test.cc b/third_party/libyuv/unit_test/rotate_test.cc index 61941e63e0..1bab584fa1 100644 --- a/third_party/libyuv/unit_test/rotate_test.cc +++ 
b/third_party/libyuv/unit_test/rotate_test.cc @@ -108,29 +108,29 @@ TEST_F(LibYUVRotateTest, I420Rotate270_Opt) { // Odd width tests work but disabled because they use C code and can be // tested by passing an odd width command line or environment variable. TEST_F(LibYUVRotateTest, DISABLED_I420Rotate0_Odd) { - I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_width_ - 3, benchmark_height_ - 1, kRotate0, + I420TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_width_ + 1, benchmark_height_ + 1, kRotate0, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_I420Rotate90_Odd) { - I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_height_ - 1, benchmark_width_ - 3, kRotate90, + I420TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_height_ + 1, benchmark_width_ + 1, kRotate90, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_I420Rotate180_Odd) { - I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_width_ - 3, benchmark_height_ - 1, kRotate180, + I420TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_width_ + 1, benchmark_height_ + 1, kRotate180, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_I420Rotate270_Odd) { - I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_height_ - 1, benchmark_width_ - 3, kRotate270, + I420TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_height_ + 1, benchmark_width_ + 1, kRotate270, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } @@ -225,29 +225,29 @@ TEST_F(LibYUVRotateTest, I444Rotate270_Opt) { // Odd width tests work but disabled because they use C code and can be // tested by passing an odd width command line or environment variable. 
TEST_F(LibYUVRotateTest, DISABLED_I444Rotate0_Odd) { - I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_width_ - 3, benchmark_height_ - 1, kRotate0, + I444TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_width_ + 1, benchmark_height_ + 1, kRotate0, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_I444Rotate90_Odd) { - I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_height_ - 1, benchmark_width_ - 3, kRotate90, + I444TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_height_ + 1, benchmark_width_ + 1, kRotate90, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_I444Rotate180_Odd) { - I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_width_ - 3, benchmark_height_ - 1, kRotate180, + I444TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_width_ + 1, benchmark_height_ + 1, kRotate180, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_I444Rotate270_Odd) { - I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_height_ - 1, benchmark_width_ - 3, kRotate270, + I444TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_height_ + 1, benchmark_width_ + 1, kRotate270, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } @@ -340,29 +340,29 @@ TEST_F(LibYUVRotateTest, NV12Rotate270_Opt) { } TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate0_Odd) { - NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_width_ - 3, benchmark_height_ - 1, kRotate0, + NV12TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_width_ + 1, benchmark_height_ + 1, kRotate0, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate90_Odd) { - NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - 
benchmark_height_ - 1, benchmark_width_ - 3, kRotate90, + NV12TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_height_ + 1, benchmark_width_ + 1, kRotate90, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate180_Odd) { - NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_width_ - 3, benchmark_height_ - 1, kRotate180, + NV12TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_width_ + 1, benchmark_height_ + 1, kRotate180, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate270_Odd) { - NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, - benchmark_height_ - 1, benchmark_width_ - 3, kRotate270, + NV12TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, + benchmark_height_ + 1, benchmark_width_ + 1, kRotate270, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } diff --git a/third_party/libyuv/unit_test/scale_argb_test.cc b/third_party/libyuv/unit_test/scale_argb_test.cc index ac9766124f..48ad75eafd 100644 --- a/third_party/libyuv/unit_test/scale_argb_test.cc +++ b/third_party/libyuv/unit_test/scale_argb_test.cc @@ -114,8 +114,8 @@ static int ARGBTestFilter(int src_width, return max_diff; } -static const int kTileX = 8; -static const int kTileY = 8; +static const int kTileX = 64; +static const int kTileY = 64; static int TileARGBScale(const uint8_t* src_argb, int src_stride_argb, @@ -232,7 +232,7 @@ static int ARGBClipTestFilter(int src_width, #define DX(x, nom, denom) static_cast<int>((Abs(x) / nom) * nom) #define SX(x, nom, denom) static_cast<int>((x / nom) * denom) -#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \ +#define TEST_FACTOR1(DISABLED_, name, filter, nom, denom, max_diff) \ TEST_F(LibYUVScaleTest, ARGBScaleDownBy##name##_##filter) { \ int diff = ARGBTestFilter( \ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ @@ -241,7 
+241,7 @@ static int ARGBClipTestFilter(int src_width, benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(LibYUVScaleTest, ARGBScaleDownClipBy##name##_##filter) { \ + TEST_F(LibYUVScaleTest, DISABLED_##ARGBScaleDownClipBy##name##_##filter) { \ int diff = ARGBClipTestFilter( \ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ @@ -251,11 +251,19 @@ static int ARGBClipTestFilter(int src_width, // Test a scale factor with all 4 filters. Expect unfiltered to be exact, but // filtering is different fixed point implementations for SSSE3, Neon and C. -#define TEST_FACTOR(name, nom, denom) \ - TEST_FACTOR1(name, None, nom, denom, 0) \ - TEST_FACTOR1(name, Linear, nom, denom, 3) \ - TEST_FACTOR1(name, Bilinear, nom, denom, 3) \ - TEST_FACTOR1(name, Box, nom, denom, 3) +#ifdef ENABLE_SLOW_TESTS +#define TEST_FACTOR(name, nom, denom) \ + TEST_FACTOR1(, name, None, nom, denom, 0) \ + TEST_FACTOR1(, name, Linear, nom, denom, 3) \ + TEST_FACTOR1(, name, Bilinear, nom, denom, 3) \ + TEST_FACTOR1(, name, Box, nom, denom, 3) +#else +#define TEST_FACTOR(name, nom, denom) \ + TEST_FACTOR1(DISABLED_, name, None, nom, denom, 0) \ + TEST_FACTOR1(DISABLED_, name, Linear, nom, denom, 3) \ + TEST_FACTOR1(DISABLED_, name, Bilinear, nom, denom, 3) \ + TEST_FACTOR1(DISABLED_, name, Box, nom, denom, 3) +#endif TEST_FACTOR(2, 1, 2) TEST_FACTOR(4, 1, 4) @@ -268,7 +276,7 @@ TEST_FACTOR(3, 1, 3) #undef SX #undef DX -#define TEST_SCALETO1(name, width, height, filter, max_diff) \ +#define TEST_SCALETO1(DISABLED_, name, width, height, filter, max_diff) \ TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \ int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, width, \ height, kFilter##filter, benchmark_iterations_, \ @@ -282,13 +290,15 @@ TEST_FACTOR(3, 1, 3) benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(LibYUVScaleTest, 
name##ClipTo##width##x##height##_##filter) { \ + TEST_F(LibYUVScaleTest, \ + DISABLED_##name##ClipTo##width##x##height##_##filter) { \ int diff = \ ARGBClipTestFilter(benchmark_width_, benchmark_height_, width, height, \ kFilter##filter, benchmark_iterations_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(LibYUVScaleTest, name##ClipFrom##width##x##height##_##filter) { \ + TEST_F(LibYUVScaleTest, \ + DISABLED_##name##ClipFrom##width##x##height##_##filter) { \ int diff = ARGBClipTestFilter(width, height, Abs(benchmark_width_), \ Abs(benchmark_height_), kFilter##filter, \ benchmark_iterations_); \ @@ -296,13 +306,20 @@ TEST_FACTOR(3, 1, 3) } /// Test scale to a specified size with all 4 filters. -#define TEST_SCALETO(name, width, height) \ - TEST_SCALETO1(name, width, height, None, 0) \ - TEST_SCALETO1(name, width, height, Linear, 3) \ - TEST_SCALETO1(name, width, height, Bilinear, 3) +#ifdef ENABLE_SLOW_TESTS +#define TEST_SCALETO(name, width, height) \ + TEST_SCALETO1(, name, width, height, None, 0) \ + TEST_SCALETO1(, name, width, height, Linear, 3) \ + TEST_SCALETO1(, name, width, height, Bilinear, 3) +#else +#define TEST_SCALETO(name, width, height) \ + TEST_SCALETO1(DISABLED_, name, width, height, None, 0) \ + TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \ + TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3) +#endif TEST_SCALETO(ARGBScale, 1, 1) -TEST_SCALETO(ARGBScale, 256, 144) /* 128x72 * 2 */ +TEST_SCALETO(ARGBScale, 256, 144) /* 128x72 * 2 */ TEST_SCALETO(ARGBScale, 320, 240) TEST_SCALETO(ARGBScale, 569, 480) TEST_SCALETO(ARGBScale, 640, 360) diff --git a/third_party/libyuv/unit_test/scale_test.cc b/third_party/libyuv/unit_test/scale_test.cc index d24806a661..6da6b574d1 100644 --- a/third_party/libyuv/unit_test/scale_test.cc +++ b/third_party/libyuv/unit_test/scale_test.cc @@ -259,6 +259,123 @@ static int I420TestFilter_12(int src_width, return max_diff; } +// Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference. 
+// 0 = exact. +static int I420TestFilter_16(int src_width, + int src_height, + int dst_width, + int dst_height, + FilterMode f, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { + if (!SizeValid(src_width, src_height, dst_width, dst_height)) { + return 0; + } + + int i; + int src_width_uv = (Abs(src_width) + 1) >> 1; + int src_height_uv = (Abs(src_height) + 1) >> 1; + + int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height)); + int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv); + + int src_stride_y = Abs(src_width); + int src_stride_uv = src_width_uv; + + align_buffer_page_end(src_y, src_y_plane_size); + align_buffer_page_end(src_u, src_uv_plane_size); + align_buffer_page_end(src_v, src_uv_plane_size); + align_buffer_page_end(src_y_16, src_y_plane_size * 2); + align_buffer_page_end(src_u_16, src_uv_plane_size * 2); + align_buffer_page_end(src_v_16, src_uv_plane_size * 2); + if (!src_y || !src_u || !src_v || !src_y_16 || !src_u_16 || !src_v_16) { + printf("Skipped. 
Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); + return 0; + } + uint16_t* p_src_y_16 = reinterpret_cast<uint16_t*>(src_y_16); + uint16_t* p_src_u_16 = reinterpret_cast<uint16_t*>(src_u_16); + uint16_t* p_src_v_16 = reinterpret_cast<uint16_t*>(src_v_16); + + MemRandomize(src_y, src_y_plane_size); + MemRandomize(src_u, src_uv_plane_size); + MemRandomize(src_v, src_uv_plane_size); + + for (i = 0; i < src_y_plane_size; ++i) { + p_src_y_16[i] = src_y[i]; + } + for (i = 0; i < src_uv_plane_size; ++i) { + p_src_u_16[i] = src_u[i]; + p_src_v_16[i] = src_v[i]; + } + + int dst_width_uv = (dst_width + 1) >> 1; + int dst_height_uv = (dst_height + 1) >> 1; + + int dst_y_plane_size = (dst_width) * (dst_height); + int dst_uv_plane_size = (dst_width_uv) * (dst_height_uv); + + int dst_stride_y = dst_width; + int dst_stride_uv = dst_width_uv; + + align_buffer_page_end(dst_y_8, dst_y_plane_size); + align_buffer_page_end(dst_u_8, dst_uv_plane_size); + align_buffer_page_end(dst_v_8, dst_uv_plane_size); + align_buffer_page_end(dst_y_16, dst_y_plane_size * 2); + align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2); + align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2); + + uint16_t* p_dst_y_16 = reinterpret_cast<uint16_t*>(dst_y_16); + uint16_t* p_dst_u_16 = reinterpret_cast<uint16_t*>(dst_u_16); + uint16_t* p_dst_v_16 = reinterpret_cast<uint16_t*>(dst_v_16); + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. + I420Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv, + src_width, src_height, dst_y_8, dst_stride_y, dst_u_8, + dst_stride_uv, dst_v_8, dst_stride_uv, dst_width, dst_height, f); + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. 
+ for (i = 0; i < benchmark_iterations; ++i) { + I420Scale_16(p_src_y_16, src_stride_y, p_src_u_16, src_stride_uv, + p_src_v_16, src_stride_uv, src_width, src_height, p_dst_y_16, + dst_stride_y, p_dst_u_16, dst_stride_uv, p_dst_v_16, + dst_stride_uv, dst_width, dst_height, f); + } + + // Expect an exact match. + int max_diff = 0; + for (i = 0; i < dst_y_plane_size; ++i) { + int abs_diff = Abs(dst_y_8[i] - p_dst_y_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + for (i = 0; i < dst_uv_plane_size; ++i) { + int abs_diff = Abs(dst_u_8[i] - p_dst_u_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + abs_diff = Abs(dst_v_8[i] - p_dst_v_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + + free_aligned_buffer_page_end(dst_y_8); + free_aligned_buffer_page_end(dst_u_8); + free_aligned_buffer_page_end(dst_v_8); + free_aligned_buffer_page_end(dst_y_16); + free_aligned_buffer_page_end(dst_u_16); + free_aligned_buffer_page_end(dst_v_16); + free_aligned_buffer_page_end(src_y); + free_aligned_buffer_page_end(src_u); + free_aligned_buffer_page_end(src_v); + free_aligned_buffer_page_end(src_y_16); + free_aligned_buffer_page_end(src_u_16); + free_aligned_buffer_page_end(src_v_16); + + return max_diff; +} + // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. static int I444TestFilter(int src_width, int src_height, @@ -494,6 +611,123 @@ static int I444TestFilter_12(int src_width, return max_diff; } +// Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference. +// 0 = exact. 
+static int I444TestFilter_16(int src_width, + int src_height, + int dst_width, + int dst_height, + FilterMode f, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { + if (!SizeValid(src_width, src_height, dst_width, dst_height)) { + return 0; + } + + int i; + int src_width_uv = Abs(src_width); + int src_height_uv = Abs(src_height); + + int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height)); + int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv); + + int src_stride_y = Abs(src_width); + int src_stride_uv = src_width_uv; + + align_buffer_page_end(src_y, src_y_plane_size); + align_buffer_page_end(src_u, src_uv_plane_size); + align_buffer_page_end(src_v, src_uv_plane_size); + align_buffer_page_end(src_y_16, src_y_plane_size * 2); + align_buffer_page_end(src_u_16, src_uv_plane_size * 2); + align_buffer_page_end(src_v_16, src_uv_plane_size * 2); + if (!src_y || !src_u || !src_v || !src_y_16 || !src_u_16 || !src_v_16) { + printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); + return 0; + } + uint16_t* p_src_y_16 = reinterpret_cast<uint16_t*>(src_y_16); + uint16_t* p_src_u_16 = reinterpret_cast<uint16_t*>(src_u_16); + uint16_t* p_src_v_16 = reinterpret_cast<uint16_t*>(src_v_16); + + MemRandomize(src_y, src_y_plane_size); + MemRandomize(src_u, src_uv_plane_size); + MemRandomize(src_v, src_uv_plane_size); + + for (i = 0; i < src_y_plane_size; ++i) { + p_src_y_16[i] = src_y[i]; + } + for (i = 0; i < src_uv_plane_size; ++i) { + p_src_u_16[i] = src_u[i]; + p_src_v_16[i] = src_v[i]; + } + + int dst_width_uv = dst_width; + int dst_height_uv = dst_height; + + int dst_y_plane_size = (dst_width) * (dst_height); + int dst_uv_plane_size = (dst_width_uv) * (dst_height_uv); + + int dst_stride_y = dst_width; + int dst_stride_uv = dst_width_uv; + + align_buffer_page_end(dst_y_8, dst_y_plane_size); + align_buffer_page_end(dst_u_8, dst_uv_plane_size); + align_buffer_page_end(dst_v_8, dst_uv_plane_size); + 
align_buffer_page_end(dst_y_16, dst_y_plane_size * 2); + align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2); + align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2); + + uint16_t* p_dst_y_16 = reinterpret_cast<uint16_t*>(dst_y_16); + uint16_t* p_dst_u_16 = reinterpret_cast<uint16_t*>(dst_u_16); + uint16_t* p_dst_v_16 = reinterpret_cast<uint16_t*>(dst_v_16); + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. + I444Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv, + src_width, src_height, dst_y_8, dst_stride_y, dst_u_8, + dst_stride_uv, dst_v_8, dst_stride_uv, dst_width, dst_height, f); + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. + for (i = 0; i < benchmark_iterations; ++i) { + I444Scale_16(p_src_y_16, src_stride_y, p_src_u_16, src_stride_uv, + p_src_v_16, src_stride_uv, src_width, src_height, p_dst_y_16, + dst_stride_y, p_dst_u_16, dst_stride_uv, p_dst_v_16, + dst_stride_uv, dst_width, dst_height, f); + } + + // Expect an exact match. 
+ int max_diff = 0; + for (i = 0; i < dst_y_plane_size; ++i) { + int abs_diff = Abs(dst_y_8[i] - p_dst_y_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + for (i = 0; i < dst_uv_plane_size; ++i) { + int abs_diff = Abs(dst_u_8[i] - p_dst_u_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + abs_diff = Abs(dst_v_8[i] - p_dst_v_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + + free_aligned_buffer_page_end(dst_y_8); + free_aligned_buffer_page_end(dst_u_8); + free_aligned_buffer_page_end(dst_v_8); + free_aligned_buffer_page_end(dst_y_16); + free_aligned_buffer_page_end(dst_u_16); + free_aligned_buffer_page_end(dst_v_16); + free_aligned_buffer_page_end(src_y); + free_aligned_buffer_page_end(src_u); + free_aligned_buffer_page_end(src_v); + free_aligned_buffer_page_end(src_y_16); + free_aligned_buffer_page_end(src_u_16); + free_aligned_buffer_page_end(src_v_16); + + return max_diff; +} + // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. 
static int NV12TestFilter(int src_width, int src_height, @@ -700,6 +934,20 @@ TEST_FACTOR(3, 1, 3, 0) benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ + TEST_F(LibYUVScaleTest, \ + DISABLED_##I420##name##To##width##x##height##_##filter##_16) { \ + int diff = I420TestFilter_16( \ + benchmark_width_, benchmark_height_, width, height, kFilter##filter, \ + benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, \ + DISABLED_##I444##name##To##width##x##height##_##filter##_16) { \ + int diff = I444TestFilter_16( \ + benchmark_width_, benchmark_height_, width, height, kFilter##filter, \ + benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ TEST_F(LibYUVScaleTest, NV12##name##To##width##x##height##_##filter) { \ int diff = NV12TestFilter(benchmark_width_, benchmark_height_, width, \ height, kFilter##filter, benchmark_iterations_, \ @@ -736,6 +984,22 @@ TEST_FACTOR(3, 1, 3, 0) benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ + TEST_F(LibYUVScaleTest, \ + DISABLED_##I420##name##From##width##x##height##_##filter##_16) { \ + int diff = I420TestFilter_16(width, height, Abs(benchmark_width_), \ + Abs(benchmark_height_), kFilter##filter, \ + benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, \ + DISABLED_##I444##name##From##width##x##height##_##filter##_16) { \ + int diff = I444TestFilter_16(width, height, Abs(benchmark_width_), \ + Abs(benchmark_height_), kFilter##filter, \ + benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ TEST_F(LibYUVScaleTest, NV12##name##From##width##x##height##_##filter) { \ int diff = NV12TestFilter(width, height, Abs(benchmark_width_), \ Abs(benchmark_height_), kFilter##filter, \ @@ -761,7 +1025,7 @@ TEST_FACTOR(3, 1, 3, 0) #endif 
TEST_SCALETO(Scale, 1, 1) -TEST_SCALETO(Scale, 256, 144) /* 128x72 * 2 */ +TEST_SCALETO(Scale, 256, 144) /* 128x72 * 2 */ TEST_SCALETO(Scale, 320, 240) TEST_SCALETO(Scale, 569, 480) TEST_SCALETO(Scale, 640, 360) @@ -801,6 +1065,20 @@ TEST_SCALETO(Scale, 1920, 1080) disable_cpu_flags_, benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ + TEST_F(LibYUVScaleTest, DISABLED_##I420##name##SwapXY_##filter##_16) { \ + int diff = I420TestFilter_16(benchmark_width_, benchmark_height_, \ + benchmark_height_, benchmark_width_, \ + kFilter##filter, benchmark_iterations_, \ + disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, DISABLED_##I444##name##SwapXY_##filter##_16) { \ + int diff = I444TestFilter_16(benchmark_width_, benchmark_height_, \ + benchmark_height_, benchmark_width_, \ + kFilter##filter, benchmark_iterations_, \ + disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ TEST_F(LibYUVScaleTest, NV12##name##SwapXY_##filter) { \ int diff = NV12TestFilter(benchmark_width_, benchmark_height_, \ benchmark_height_, benchmark_width_, \ diff --git a/third_party/libyuv/unit_test/scale_uv_test.cc b/third_party/libyuv/unit_test/scale_uv_test.cc index 59eeee3043..6e4649f84d 100644 --- a/third_party/libyuv/unit_test/scale_uv_test.cc +++ b/third_party/libyuv/unit_test/scale_uv_test.cc @@ -166,7 +166,7 @@ TEST_FACTOR(3, 1, 3) TEST_SCALETO1(name, width, height, Bilinear, 3) TEST_SCALETO(UVScale, 1, 1) -TEST_SCALETO(UVScale, 256, 144) /* 128x72 * 2 */ +TEST_SCALETO(UVScale, 256, 144) /* 128x72 * 2 */ TEST_SCALETO(UVScale, 320, 240) TEST_SCALETO(UVScale, 569, 480) TEST_SCALETO(UVScale, 640, 360) diff --git a/third_party/libyuv/unit_test/unit_test.cc b/third_party/libyuv/unit_test/unit_test.cc index 85e3b7170f..e6dbc3eed6 100644 --- a/third_party/libyuv/unit_test/unit_test.cc +++ b/third_party/libyuv/unit_test/unit_test.cc @@ -26,9 +26,13 @@ unsigned int fastrand_seed = 0xfb; ABSL_FLAG(int32_t, 
libyuv_width, 0, "width of test image."); ABSL_FLAG(int32_t, libyuv_height, 0, "height of test image."); ABSL_FLAG(int32_t, libyuv_repeat, 0, "number of times to repeat test."); -ABSL_FLAG(int32_t, libyuv_flags, 0, +ABSL_FLAG(int32_t, + libyuv_flags, + 0, "cpu flags for reference code. 1 = C, -1 = SIMD"); -ABSL_FLAG(int32_t, libyuv_cpu_info, 0, +ABSL_FLAG(int32_t, + libyuv_cpu_info, + 0, "cpu flags for benchmark code. 1 = C, -1 = SIMD"); #else // Disable command line parameters if absl/flags disabled. diff --git a/third_party/libyuv/unit_test/unit_test.h b/third_party/libyuv/unit_test/unit_test.h index 87907fa160..580832addc 100644 --- a/third_party/libyuv/unit_test/unit_test.h +++ b/third_party/libyuv/unit_test/unit_test.h @@ -11,7 +11,7 @@ #ifndef UNIT_TEST_UNIT_TEST_H_ // NOLINT #define UNIT_TEST_UNIT_TEST_H_ -#ifdef WIN32 +#ifdef _WIN32 #include <windows.h> #else #include <sys/resource.h> diff --git a/third_party/libyuv/unit_test/video_common_test.cc b/third_party/libyuv/unit_test/video_common_test.cc index 6c6a384d41..36728ea900 100644 --- a/third_party/libyuv/unit_test/video_common_test.cc +++ b/third_party/libyuv/unit_test/video_common_test.cc @@ -29,7 +29,7 @@ static bool TestValidFourCC(uint32_t fourcc, int bpp) { !TestValidChar((fourcc >> 24) & 0xff)) { return false; } - if (bpp < 0 || bpp > 32) { + if (bpp < 0 || bpp > 64) { return false; } return true; @@ -72,6 +72,8 @@ TEST_F(LibYUVBaseTest, TestFourCC) { EXPECT_TRUE(TestValidFourCC(FOURCC_ABGR, FOURCC_BPP_ABGR)); EXPECT_TRUE(TestValidFourCC(FOURCC_AR30, FOURCC_BPP_AR30)); EXPECT_TRUE(TestValidFourCC(FOURCC_AB30, FOURCC_BPP_AB30)); + EXPECT_TRUE(TestValidFourCC(FOURCC_AR64, FOURCC_BPP_AR64)); + EXPECT_TRUE(TestValidFourCC(FOURCC_AB64, FOURCC_BPP_AB64)); EXPECT_TRUE(TestValidFourCC(FOURCC_24BG, FOURCC_BPP_24BG)); EXPECT_TRUE(TestValidFourCC(FOURCC_RAW, FOURCC_BPP_RAW)); EXPECT_TRUE(TestValidFourCC(FOURCC_RGBA, FOURCC_BPP_RGBA)); diff --git a/third_party/opus/src/celt/celt_lpc.c 
b/third_party/opus/src/celt/celt_lpc.c index 457e7ed0d2..242e6df55e 100644 --- a/third_party/opus/src/celt/celt_lpc.c +++ b/third_party/opus/src/celt/celt_lpc.c @@ -50,7 +50,11 @@ int p #endif OPUS_CLEAR(lpc, p); +#ifdef FIXED_POINT if (ac[0] != 0) +#else + if (ac[0] > 1e-10f) +#endif { for (i = 0; i < p; i++) { /* Sum up this iteration's reflection coefficient */ @@ -73,10 +77,10 @@ int p error = error - MULT32_32_Q31(MULT32_32_Q31(r,r),error); /* Bail out once we get 30 dB gain */ #ifdef FIXED_POINT - if (error<SHR32(ac[0],10)) + if (error<=SHR32(ac[0],10)) break; #else - if (error<.001f*ac[0]) + if (error<=.001f*ac[0]) break; #endif } diff --git a/third_party/opus/src/src/opus_encoder.c b/third_party/opus/src/src/opus_encoder.c index 321bb2bb1e..253fe9e880 100644 --- a/third_party/opus/src/src/opus_encoder.c +++ b/third_party/opus/src/src/opus_encoder.c @@ -900,10 +900,10 @@ static int decide_dtx_mode(opus_int activity, /* indicates if this fr { if (!activity) { - /* The number of consecutive DTX frames should be within the allowed bounds. - Note that the allowed bound is defined in the Silk headers and assumes 20 ms - frames. As this function can be called with any frame length, a conversion to - miliseconds is done before the comparisons. */ + /* The number of consecutive DTX frames should be within the allowed bounds. + Note that the allowed bound is defined in the SILK headers and assumes 20 ms + frames. As this function can be called with any frame length, a conversion to + milliseconds is done before the comparisons. 
*/ (*nb_no_activity_ms_Q1) += frame_size_ms_Q1; if (*nb_no_activity_ms_Q1 > NB_SPEECH_FRAMES_BEFORE_DTX*20*2) { diff --git a/third_party/pffft/pffft_unittest.cc b/third_party/pffft/pffft_unittest.cc index 559723434e..c2bf184191 100644 --- a/third_party/pffft/pffft_unittest.cc +++ b/third_party/pffft/pffft_unittest.cc @@ -68,7 +68,7 @@ void PffftValidate(int fft_size, bool complex_fft) { } for (k = 0; k < num_floats; ++k) { - ref_max = std::max(ref_max, fabs(ref[k])); + ref_max = std::max<float>(ref_max, (float) fabs(ref[k])); } // Pass 0: non canonical ordering of transform coefficients. diff --git a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_constants.h b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_constants.h index b8e2f2d581..be2028eb27 100755 --- a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_constants.h +++ b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_constants.h @@ -34,7 +34,7 @@ #if defined(__FreeBSD__) && !defined(__Userspace__) #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/sys/netinet/sctp_constants.h 365071 2020-09-01 21:19:14Z mjg $"); +__FBSDID("$FreeBSD$"); #endif #ifndef _NETINET_SCTP_CONSTANTS_H_ @@ -610,7 +610,7 @@ extern void getwintimeofday(struct timeval *tv); #define SCTP_RTO_UPPER_BOUND (60000) /* 60 sec in ms */ #define SCTP_RTO_LOWER_BOUND (1000) /* 1 sec is ms */ -#define SCTP_RTO_INITIAL (3000) /* 3 sec in ms */ +#define SCTP_RTO_INITIAL (1000) /* 1 sec in ms */ #define SCTP_INP_KILL_TIMEOUT 20 /* number of ms to retry kill of inpcb */ #define SCTP_ASOC_KILL_TIMEOUT 10 /* number of ms to retry kill of inpcb */ diff --git a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_input.c b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_input.c index f3c3644855..fb6e4c23eb 100755 --- a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_input.c +++ b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_input.c @@ -108,57 +108,12 @@ sctp_handle_init(struct mbuf *m, int iphlen, int 
offset, if (stcb == NULL) { SCTP_INP_RLOCK(inp); } - /* validate length */ - if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_init_chunk)) { - op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); - sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, -#if defined(__FreeBSD__) && !defined(__Userspace__) - mflowtype, mflowid, -#endif - vrf_id, port); - if (stcb) - *abort_no_unlock = 1; - goto outnow; - } - /* validate parameters */ + /* Validate parameters */ init = &cp->init; - if (init->initiate_tag == 0) { - /* protocol error... send abort */ - op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); - sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, -#if defined(__FreeBSD__) && !defined(__Userspace__) - mflowtype, mflowid, -#endif - vrf_id, port); - if (stcb) - *abort_no_unlock = 1; - goto outnow; - } - if (ntohl(init->a_rwnd) < SCTP_MIN_RWND) { - /* invalid parameter... send abort */ - op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); - sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, -#if defined(__FreeBSD__) && !defined(__Userspace__) - mflowtype, mflowid, -#endif - vrf_id, port); - if (stcb) - *abort_no_unlock = 1; - goto outnow; - } - if (init->num_inbound_streams == 0) { - /* protocol error... send abort */ - op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); - sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, -#if defined(__FreeBSD__) && !defined(__Userspace__) - mflowtype, mflowid, -#endif - vrf_id, port); - if (stcb) - *abort_no_unlock = 1; - goto outnow; - } - if (init->num_outbound_streams == 0) { + if ((ntohl(init->initiate_tag) == 0) || + (ntohl(init->a_rwnd) < SCTP_MIN_RWND) || + (ntohs(init->num_inbound_streams) == 0) || + (ntohs(init->num_outbound_streams) == 0)) { /* protocol error... 
send abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, @@ -514,26 +469,34 @@ sctp_process_init_ack(struct mbuf *m, int iphlen, int offset, asoc = &stcb->asoc; asoc->peer_supports_nat = (uint8_t)nat_friendly; /* process the peer's parameters in the INIT-ACK */ - retval = sctp_process_init((struct sctp_init_chunk *)cp, stcb); - if (retval < 0) { + if (sctp_process_init((struct sctp_init_chunk *)cp, stcb) < 0) { if (op_err != NULL) { sctp_m_freem(op_err); } - return (retval); + op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); + SCTPDBG(SCTP_DEBUG_INPUT1, "sctp_process_init() failed\n"); + sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, + src, dst, sh, op_err, +#if defined(__FreeBSD__) && !defined(__Userspace__) + mflowtype, mflowid, +#endif + vrf_id, net->port); + *abort_no_unlock = 1; + return (-1); } initack_limit = offset + ntohs(cp->ch.chunk_length); /* load all addresses */ if ((retval = sctp_load_addresses_from_init(stcb, m, - (offset + sizeof(struct sctp_init_chunk)), initack_limit, - src, dst, NULL, stcb->asoc.port))) { + offset + sizeof(struct sctp_init_chunk), + initack_limit, src, dst, NULL, stcb->asoc.port)) < 0) { if (op_err != NULL) { sctp_m_freem(op_err); } op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), "Problem with address parameters"); SCTPDBG(SCTP_DEBUG_INPUT1, - "Load addresses from INIT causes an abort %d\n", - retval); + "Load addresses from INIT causes an abort %d\n", + retval); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, #if defined(__FreeBSD__) && !defined(__Userspace__) @@ -1420,57 +1383,12 @@ sctp_handle_init_ack(struct mbuf *m, int iphlen, int offset, "sctp_handle_init_ack: TCB is null\n"); return (-1); } - if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_init_ack_chunk)) { - /* Invalid length */ - op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); - 
sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, - src, dst, sh, op_err, -#if defined(__FreeBSD__) && !defined(__Userspace__) - mflowtype, mflowid, -#endif - vrf_id, net->port); - *abort_no_unlock = 1; - return (-1); - } init_ack = &cp->init; - /* validate parameters */ - if (init_ack->initiate_tag == 0) { - /* protocol error... send an abort */ - op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); - sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, - src, dst, sh, op_err, -#if defined(__FreeBSD__) && !defined(__Userspace__) - mflowtype, mflowid, -#endif - vrf_id, net->port); - *abort_no_unlock = 1; - return (-1); - } - if (ntohl(init_ack->a_rwnd) < SCTP_MIN_RWND) { - /* protocol error... send an abort */ - op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); - sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, - src, dst, sh, op_err, -#if defined(__FreeBSD__) && !defined(__Userspace__) - mflowtype, mflowid, -#endif - vrf_id, net->port); - *abort_no_unlock = 1; - return (-1); - } - if (init_ack->num_inbound_streams == 0) { - /* protocol error... send an abort */ - op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); - sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, - src, dst, sh, op_err, -#if defined(__FreeBSD__) && !defined(__Userspace__) - mflowtype, mflowid, -#endif - vrf_id, net->port); - *abort_no_unlock = 1; - return (-1); - } - if (init_ack->num_outbound_streams == 0) { + /* Validate parameters. */ + if ((ntohl(init_ack->initiate_tag) == 0) || + (ntohl(init_ack->a_rwnd) < SCTP_MIN_RWND) || + (ntohs(init_ack->num_inbound_streams) == 0) || + (ntohs(init_ack->num_outbound_streams) == 0)) { /* protocol error... 
send an abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, @@ -1624,6 +1542,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, vrf_id, net->port); if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 2; + SCTP_TCB_UNLOCK(stcb); return (NULL); } /* @@ -1638,9 +1557,11 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, (uint8_t *) & init_buf); if (init_cp == NULL) { /* could not pull a INIT chunk in cookie */ + SCTP_TCB_UNLOCK(stcb); return (NULL); } if (init_cp->ch.chunk_type != SCTP_INITIATION) { + SCTP_TCB_UNLOCK(stcb); return (NULL); } /* @@ -1653,9 +1574,11 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, (uint8_t *) & initack_buf); if (initack_cp == NULL) { /* could not pull INIT-ACK chunk in cookie */ + SCTP_TCB_UNLOCK(stcb); return (NULL); } if (initack_cp->ch.chunk_type != SCTP_INITIATION_ACK) { + SCTP_TCB_UNLOCK(stcb); return (NULL); } if ((ntohl(initack_cp->init.initiate_tag) == asoc->my_vtag) && @@ -1681,6 +1604,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, */ if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 17; + SCTP_TCB_UNLOCK(stcb); return (NULL); } switch (SCTP_GET_STATE(stcb)) { @@ -1693,10 +1617,17 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, * have the right seq no's. */ /* First we must process the INIT !! 
*/ - retval = sctp_process_init(init_cp, stcb); - if (retval < 0) { + if (sctp_process_init(init_cp, stcb) < 0) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 3; + op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); + SCTPDBG(SCTP_DEBUG_INPUT1, "sctp_process_init() failed\n"); + sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, + src, dst, sh, op_err, +#if defined(__FreeBSD__) && !defined(__Userspace__) + mflowtype, mflowid, +#endif + vrf_id, net->port); return (NULL); } /* we have already processed the INIT so no problem */ @@ -1741,6 +1672,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, SCTP_TCB_LOCK(stcb); atomic_add_int(&stcb->asoc.refcnt, -1); if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { + SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_UNLOCK(so, 1); return (NULL); } @@ -1776,16 +1708,22 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, break; } /* end switch */ sctp_stop_all_cookie_timers(stcb); - /* - * We ignore the return code here.. not sure if we should - * somehow abort.. but we do have an existing asoc. This - * really should not fail. 
- */ - if (sctp_load_addresses_from_init(stcb, m, - init_offset + sizeof(struct sctp_init_chunk), - initack_offset, src, dst, init_src, stcb->asoc.port)) { + if ((retval = sctp_load_addresses_from_init(stcb, m, + init_offset + sizeof(struct sctp_init_chunk), + initack_offset, src, dst, init_src, stcb->asoc.port)) < 0) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 4; + op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), + "Problem with address parameters"); + SCTPDBG(SCTP_DEBUG_INPUT1, + "Load addresses from INIT causes an abort %d\n", + retval); + sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, + src, dst, sh, op_err, +#if defined(__FreeBSD__) && !defined(__Userspace__) + mflowtype, mflowid, +#endif + vrf_id, net->port); return (NULL); } /* respond with a COOKIE-ACK */ @@ -1805,6 +1743,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, */ if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 6; + SCTP_TCB_UNLOCK(stcb); return (NULL); } /* If nat support, and the below and stcb is established, @@ -1830,6 +1769,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, mflowtype, mflowid, inp->fibnum, #endif vrf_id, port); + SCTP_TCB_UNLOCK(stcb); return (NULL); } if ((ntohl(initack_cp->init.initiate_tag) == asoc->my_vtag) && @@ -1859,6 +1799,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 7; + SCTP_TCB_UNLOCK(stcb); return (NULL); } if (how_indx < sizeof(asoc->cookie_how)) @@ -1901,17 +1842,35 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, } } /* process the INIT info (peer's info) */ - retval = sctp_process_init(init_cp, stcb); - if (retval < 0) { + if (sctp_process_init(init_cp, stcb) < 0) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 9; + op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); + 
SCTPDBG(SCTP_DEBUG_INPUT1, "sctp_process_init() failed\n"); + sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, + src, dst, sh, op_err, +#if defined(__FreeBSD__) && !defined(__Userspace__) + mflowtype, mflowid, +#endif + vrf_id, net->port); return (NULL); } - if (sctp_load_addresses_from_init(stcb, m, - init_offset + sizeof(struct sctp_init_chunk), - initack_offset, src, dst, init_src, stcb->asoc.port)) { + if ((retval = sctp_load_addresses_from_init(stcb, m, + init_offset + sizeof(struct sctp_init_chunk), + initack_offset, src, dst, init_src, stcb->asoc.port)) < 0) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 10; + op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), + "Problem with address parameters"); + SCTPDBG(SCTP_DEBUG_INPUT1, + "Load addresses from INIT causes an abort %d\n", + retval); + sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, + src, dst, sh, op_err, +#if defined(__FreeBSD__) && !defined(__Userspace__) + mflowtype, mflowid, +#endif + vrf_id, net->port); return (NULL); } if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || @@ -1933,6 +1892,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, SCTP_TCB_LOCK(stcb); atomic_add_int(&stcb->asoc.refcnt, -1); if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { + SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_UNLOCK(so, 1); return (NULL); } @@ -1985,19 +1945,25 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, #endif if (asoc->peer_supports_nat) { + struct sctp_tcb *local_stcb; + /* This is a gross gross hack. * Just call the cookie_new code since we * are allowing a duplicate association. * I hope this works... 
*/ - return (sctp_process_cookie_new(m, iphlen, offset, src, dst, - sh, cookie, cookie_len, - inp, netp, init_src,notification, - auth_skipped, auth_offset, auth_len, + local_stcb = sctp_process_cookie_new(m, iphlen, offset, src, dst, + sh, cookie, cookie_len, + inp, netp, init_src,notification, + auth_skipped, auth_offset, auth_len, #if defined(__FreeBSD__) && !defined(__Userspace__) - mflowtype, mflowid, + mflowtype, mflowid, #endif - vrf_id, port)); + vrf_id, port); + if (local_stcb == NULL) { + SCTP_TCB_UNLOCK(stcb); + } + return (local_stcb); } /* * case A in Section 5.2.4 Table 2: XXMM (peer restarted) @@ -2005,11 +1971,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, /* temp code */ if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 12; - sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, net, - SCTP_FROM_SCTP_INPUT + SCTP_LOC_16); - sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, - SCTP_FROM_SCTP_INPUT + SCTP_LOC_17); - + sctp_stop_association_timers(stcb, false); /* notify upper layer */ *notification = SCTP_NOTIFY_ASSOC_RESTART; atomic_add_int(&stcb->asoc.refcnt, 1); @@ -2042,6 +2004,10 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, asoc->str_reset_seq_in = asoc->init_seq_number; asoc->advanced_peer_ack_point = asoc->last_acked_seq; asoc->send_sack = 1; + asoc->data_pkts_seen = 0; + asoc->last_data_chunk_from = NULL; + asoc->last_control_chunk_from = NULL; + asoc->last_net_cmt_send_started = NULL; if (asoc->mapping_array) { memset(asoc->mapping_array, 0, asoc->mapping_array_size); @@ -2106,6 +2072,9 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); SCTP_DECR_CHK_COUNT(); } + asoc->ctrl_queue_cnt = 0; + asoc->str_reset = NULL; + asoc->stream_reset_outstanding = 0; TAILQ_FOREACH_SAFE(chk, &asoc->asconf_send_queue, sctp_next, nchk) { TAILQ_REMOVE(&asoc->asconf_send_queue, chk, sctp_next); if (chk->data) { @@ 
-2154,11 +2123,17 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, asoc->total_flight = 0; asoc->total_flight_count = 0; /* process the INIT info (peer's info) */ - retval = sctp_process_init(init_cp, stcb); - if (retval < 0) { + if (sctp_process_init(init_cp, stcb) < 0) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 13; - + op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); + SCTPDBG(SCTP_DEBUG_INPUT1, "sctp_process_init() failed\n"); + sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, + src, dst, sh, op_err, +#if defined(__FreeBSD__) && !defined(__Userspace__) + mflowtype, mflowid, +#endif + vrf_id, net->port); return (NULL); } /* @@ -2167,26 +2142,38 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, */ net->hb_responded = 1; - if (sctp_load_addresses_from_init(stcb, m, - init_offset + sizeof(struct sctp_init_chunk), - initack_offset, src, dst, init_src, stcb->asoc.port)) { + if ((retval = sctp_load_addresses_from_init(stcb, m, + init_offset + sizeof(struct sctp_init_chunk), + initack_offset, src, dst, init_src, stcb->asoc.port)) < 0) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 14; - + op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), + "Problem with address parameters"); + SCTPDBG(SCTP_DEBUG_INPUT1, + "Load addresses from INIT causes an abort %d\n", + retval); + sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, + src, dst, sh, op_err, +#if defined(__FreeBSD__) && !defined(__Userspace__) + mflowtype, mflowid, +#endif + vrf_id, net->port); return (NULL); } /* respond with a COOKIE-ACK */ - sctp_stop_all_cookie_timers(stcb); - sctp_toss_old_cookies(stcb, asoc); sctp_send_cookie_ack(stcb); if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 15; - + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE) && + (asoc->sctp_autoclose_ticks > 0)) { + sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb, NULL); + } return 
(stcb); } if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 16; /* all other cases... */ + SCTP_TCB_UNLOCK(stcb); return (NULL); } @@ -2360,8 +2347,7 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, asoc->advanced_peer_ack_point = asoc->last_acked_seq; /* process the INIT info (peer's info) */ - retval = sctp_process_init(init_cp, stcb); - if (retval < 0) { + if (sctp_process_init(init_cp, stcb) < 0) { #if defined(__APPLE__) && !defined(__Userspace__) atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); @@ -2377,9 +2363,9 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, return (NULL); } /* load all addresses */ - if (sctp_load_addresses_from_init(stcb, m, - init_offset + sizeof(struct sctp_init_chunk), initack_offset, - src, dst, init_src, port)) { + if ((retval = sctp_load_addresses_from_init(stcb, m, + init_offset + sizeof(struct sctp_init_chunk), + initack_offset, src, dst, init_src, port)) < 0) { #if defined(__APPLE__) && !defined(__Userspace__) atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); @@ -2956,12 +2942,15 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, had_a_existing_tcb = 1; *stcb = sctp_process_cookie_existing(m, iphlen, offset, src, dst, sh, - cookie, cookie_len, *inp_p, *stcb, netp, to, - ¬ification, auth_skipped, auth_offset, auth_len, + cookie, cookie_len, *inp_p, *stcb, netp, to, + ¬ification, auth_skipped, auth_offset, auth_len, #if defined(__FreeBSD__) && !defined(__Userspace__) mflowtype, mflowid, #endif vrf_id, port); + if (*stcb == NULL) { + *locked_tcb = NULL; + } } if (*stcb == NULL) { @@ -3847,19 +3836,16 @@ sctp_find_stream_reset(struct sctp_tcb *stcb, uint32_t seq, struct sctp_tmit_chu int len, clen; asoc = &stcb->asoc; - if (TAILQ_EMPTY(&stcb->asoc.control_send_queue)) { - asoc->stream_reset_outstanding = 0; - return (NULL); - } - if (stcb->asoc.str_reset == NULL) { + chk = asoc->str_reset; + if (TAILQ_EMPTY(&asoc->control_send_queue) || + 
(chk == NULL)) { asoc->stream_reset_outstanding = 0; return (NULL); } - chk = stcb->asoc.str_reset; if (chk->data == NULL) { return (NULL); } - if (bchk) { + if (bchk != NULL) { /* he wants a copy of the chk pointer */ *bchk = chk; } @@ -4798,6 +4784,7 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, int ret; int abort_no_unlock = 0; int ecne_seen = 0; + int abort_flag; /* * How big should this be, and should it be alloc'd? Lets try the * d-mtu-ceiling for now (2k) and that should hopefully work ... @@ -4962,29 +4949,6 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, } return (NULL); } - } else if (ch->chunk_type == SCTP_SHUTDOWN_ACK) { - if (vtag_in != asoc->my_vtag) { - /* - * this could be a stale SHUTDOWN-ACK or the - * peer never got the SHUTDOWN-COMPLETE and - * is still hung; we have started a new asoc - * but it won't complete until the shutdown - * is completed - */ - if (stcb != NULL) { - SCTP_TCB_UNLOCK(stcb); - } - SCTP_SNPRINTF(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__); - op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), - msg); - sctp_handle_ootb(m, iphlen, *offset, src, dst, - sh, inp, op_err, -#if defined(__FreeBSD__) && !defined(__Userspace__) - mflowtype, mflowid, fibnum, -#endif - vrf_id, port); - return (NULL); - } } else { /* for all other chunks, vtag must match */ if (vtag_in != asoc->my_vtag) { @@ -5047,10 +5011,7 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, chunk_buf); if (ch == NULL) { *offset = length; - if (stcb != NULL) { - SCTP_TCB_UNLOCK(stcb); - } - return (NULL); + return (stcb); } num_chunks++; @@ -5084,12 +5045,12 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, /* The INIT chunk must be the only chunk. 
*/ if ((num_chunks > 1) || (length - *offset > (int)SCTP_SIZE32(chk_length))) { - /* RFC 4960 requires that no ABORT is sent */ + /* + * RFC 4960bis requires stopping the + * processing of the packet. + */ *offset = length; - if (stcb != NULL) { - SCTP_TCB_UNLOCK(stcb); - } - return (NULL); + return (stcb); } /* Honor our resource limit. */ if (chk_length > SCTP_LARGEST_INIT_ACCEPTED) { @@ -5296,20 +5257,19 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, if ((stcb != NULL) && (netp != NULL) && (*netp != NULL)) { SCTP_STAT_INCR(sctps_recvheartbeat); sctp_send_heartbeat_ack(stcb, m, *offset, - chk_length, *netp); + chk_length, *netp); } break; case SCTP_HEARTBEAT_ACK: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_HEARTBEAT_ACK\n"); if ((stcb == NULL) || (chk_length != sizeof(struct sctp_heartbeat_chunk))) { /* Its not ours */ - *offset = length; - return (stcb); + break; } SCTP_STAT_INCR(sctps_recvheartbeatack); if ((netp != NULL) && (*netp != NULL)) { sctp_handle_heartbeat_ack((struct sctp_heartbeat_chunk *)ch, - stcb, *netp); + stcb, *netp); } break; case SCTP_ABORT_ASSOCIATION: @@ -5330,14 +5290,12 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SHUTDOWN, stcb %p\n", (void *)stcb); if ((stcb == NULL) || (chk_length != sizeof(struct sctp_shutdown_chunk))) { - *offset = length; - return (stcb); + break; } if ((netp != NULL) && (*netp != NULL)) { - int abort_flag = 0; - + abort_flag = 0; sctp_handle_shutdown((struct sctp_shutdown_chunk *)ch, - stcb, *netp, &abort_flag); + stcb, *netp, &abort_flag); if (abort_flag) { *offset = length; return (NULL); @@ -5346,11 +5304,12 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, break; case SCTP_SHUTDOWN_ACK: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SHUTDOWN_ACK, stcb %p\n", (void *)stcb); - if ((stcb != NULL) && (netp != NULL) && (*netp != NULL)) { + if ((chk_length == sizeof(struct sctp_shutdown_ack_chunk)) && + (stcb != NULL) && 
(netp != NULL) && (*netp != NULL)) { sctp_handle_shutdown_ack((struct sctp_shutdown_ack_chunk *)ch, stcb, *netp); + *offset = length; + return (NULL); } - *offset = length; - return (NULL); break; case SCTP_OPERATION_ERROR: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_OP_ERR\n"); @@ -5494,7 +5453,7 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, case SCTP_COOKIE_ACK: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_COOKIE_ACK, stcb %p\n", (void *)stcb); if ((stcb == NULL) || chk_length != sizeof(struct sctp_cookie_ack_chunk)) { - return (stcb); + break; } if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { /* We are not interested anymore */ @@ -5524,26 +5483,29 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, break; case SCTP_ECN_ECHO: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ECN_ECHO\n"); - if ((stcb == NULL) || (chk_length != sizeof(struct sctp_ecne_chunk))) { - /* Its not ours */ - *offset = length; - return (stcb); + if (stcb == NULL) { + break; } if (stcb->asoc.ecn_supported == 0) { goto unknown_chunk; } + if (chk_length != sizeof(struct sctp_ecne_chunk)) { + break; + } sctp_handle_ecn_echo((struct sctp_ecne_chunk *)ch, stcb); ecne_seen = 1; break; case SCTP_ECN_CWR: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ECN_CWR\n"); - if ((stcb == NULL) || (chk_length != sizeof(struct sctp_cwr_chunk))) { - *offset = length; - return (stcb); + if (stcb == NULL) { + break; } if (stcb->asoc.ecn_supported == 0) { goto unknown_chunk; } + if (chk_length != sizeof(struct sctp_cwr_chunk)) { + break; + } sctp_handle_ecn_cwr((struct sctp_cwr_chunk *)ch, stcb, *netp); break; case SCTP_SHUTDOWN_COMPLETE: @@ -5554,12 +5516,13 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, *offset = length; return (stcb); } - if ((stcb != NULL) && (netp != NULL) && (*netp != NULL)) { + if ((chk_length == sizeof(struct sctp_shutdown_complete_chunk)) && + (stcb != NULL) && (netp != NULL) && (*netp != NULL)) { sctp_handle_shutdown_complete((struct 
sctp_shutdown_complete_chunk *)ch, - stcb, *netp); + stcb, *netp); + *offset = length; + return (NULL); } - *offset = length; - return (NULL); break; case SCTP_ASCONF: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ASCONF\n"); @@ -5568,32 +5531,33 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, goto unknown_chunk; } sctp_handle_asconf(m, *offset, src, - (struct sctp_asconf_chunk *)ch, stcb, asconf_cnt == 0); + (struct sctp_asconf_chunk *)ch, stcb, asconf_cnt == 0); asconf_cnt++; } break; case SCTP_ASCONF_ACK: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ASCONF_ACK\n"); + if (stcb == NULL) { + break; + } + if (stcb->asoc.asconf_supported == 0) { + goto unknown_chunk; + } if (chk_length < sizeof(struct sctp_asconf_ack_chunk)) { - /* Its not ours */ - *offset = length; - return (stcb); + break; } - if ((stcb != NULL) && (netp != NULL) && (*netp != NULL)) { - if (stcb->asoc.asconf_supported == 0) { - goto unknown_chunk; - } + if ((netp != NULL) && (*netp != NULL)) { /* He's alive so give him credit */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, - stcb->asoc.overall_error_count, - 0, - SCTP_FROM_SCTP_INPUT, - __LINE__); + stcb->asoc.overall_error_count, + 0, + SCTP_FROM_SCTP_INPUT, + __LINE__); } stcb->asoc.overall_error_count = 0; sctp_handle_asconf_ack(m, *offset, - (struct sctp_asconf_ack_chunk *)ch, stcb, *netp, &abort_no_unlock); + (struct sctp_asconf_ack_chunk *)ch, stcb, *netp, &abort_no_unlock); if (abort_no_unlock) return (NULL); } @@ -5602,72 +5566,70 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, case SCTP_IFORWARD_CUM_TSN: SCTPDBG(SCTP_DEBUG_INPUT3, "%s\n", ch->chunk_type == SCTP_FORWARD_CUM_TSN ? 
"FORWARD_TSN" : "I_FORWARD_TSN"); + if (stcb == NULL) { + break; + } + if (stcb->asoc.prsctp_supported == 0) { + goto unknown_chunk; + } if (chk_length < sizeof(struct sctp_forward_tsn_chunk)) { - /* Its not ours */ - *offset = length; - return (stcb); + break; } - - if (stcb != NULL) { - int abort_flag = 0; - - if (stcb->asoc.prsctp_supported == 0) { - goto unknown_chunk; - } - if (((stcb->asoc.idata_supported == 1) && (ch->chunk_type == SCTP_FORWARD_CUM_TSN)) || - ((stcb->asoc.idata_supported == 0) && (ch->chunk_type == SCTP_IFORWARD_CUM_TSN))) { - if (ch->chunk_type == SCTP_FORWARD_CUM_TSN) { - SCTP_SNPRINTF(msg, sizeof(msg), "%s", "FORWARD-TSN chunk received when I-FORWARD-TSN was negotiated"); - } else { - SCTP_SNPRINTF(msg, sizeof(msg), "%s", "I-FORWARD-TSN chunk received when FORWARD-TSN was negotiated"); - } - op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); - sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED); - *offset = length; - return (NULL); + if (((stcb->asoc.idata_supported == 1) && (ch->chunk_type == SCTP_FORWARD_CUM_TSN)) || + ((stcb->asoc.idata_supported == 0) && (ch->chunk_type == SCTP_IFORWARD_CUM_TSN))) { + if (ch->chunk_type == SCTP_FORWARD_CUM_TSN) { + SCTP_SNPRINTF(msg, sizeof(msg), "%s", "FORWARD-TSN chunk received when I-FORWARD-TSN was negotiated"); + } else { + SCTP_SNPRINTF(msg, sizeof(msg), "%s", "I-FORWARD-TSN chunk received when FORWARD-TSN was negotiated"); } - *fwd_tsn_seen = 1; - if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { - /* We are not interested anymore */ + op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); + sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED); + *offset = length; + return (NULL); + } + *fwd_tsn_seen = 1; + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { + /* We are not interested anymore */ #if defined(__APPLE__) && !defined(__Userspace__) - so = SCTP_INP_SO(inp); - atomic_add_int(&stcb->asoc.refcnt, 1); - SCTP_TCB_UNLOCK(stcb); - 
SCTP_SOCKET_LOCK(so, 1); - SCTP_TCB_LOCK(stcb); - atomic_subtract_int(&stcb->asoc.refcnt, 1); + so = SCTP_INP_SO(inp); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif - (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, - SCTP_FROM_SCTP_INPUT + SCTP_LOC_31); + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_31); #if defined(__APPLE__) && !defined(__Userspace__) - SCTP_SOCKET_UNLOCK(so, 1); + SCTP_SOCKET_UNLOCK(so, 1); #endif - *offset = length; - return (NULL); - } - /* - * For sending a SACK this looks like DATA - * chunks. - */ - stcb->asoc.last_data_chunk_from = stcb->asoc.last_control_chunk_from; - sctp_handle_forward_tsn(stcb, - (struct sctp_forward_tsn_chunk *)ch, &abort_flag, m, *offset); - if (abort_flag) { - *offset = length; - return (NULL); - } + *offset = length; + return (NULL); + } + /* + * For sending a SACK this looks like DATA + * chunks. 
+ */ + stcb->asoc.last_data_chunk_from = stcb->asoc.last_control_chunk_from; + abort_flag = 0; + sctp_handle_forward_tsn(stcb, + (struct sctp_forward_tsn_chunk *)ch, &abort_flag, m, *offset); + if (abort_flag) { + *offset = length; + return (NULL); } break; case SCTP_STREAM_RESET: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_STREAM_RESET\n"); - if ((stcb == NULL) || (chk_length < sizeof(struct sctp_stream_reset_tsn_req))) { - /* Its not ours */ - *offset = length; - return (stcb); + if (stcb == NULL) { + break; } if (stcb->asoc.reconfig_supported == 0) { goto unknown_chunk; } + if (chk_length < sizeof(struct sctp_stream_reset_tsn_req)) { + break; + } if (sctp_handle_stream_reset(stcb, m, *offset, ch)) { /* stop processing */ *offset = length; @@ -5676,20 +5638,19 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, break; case SCTP_PACKET_DROPPED: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_PACKET_DROPPED\n"); - /* re-get it all please */ + if (stcb == NULL) { + break; + } + if (stcb->asoc.pktdrop_supported == 0) { + goto unknown_chunk; + } if (chk_length < sizeof(struct sctp_pktdrop_chunk)) { - /* Its not ours */ - *offset = length; - return (stcb); + break; } - - if ((stcb != NULL) && (netp != NULL) && (*netp != NULL)) { - if (stcb->asoc.pktdrop_supported == 0) { - goto unknown_chunk; - } + if ((netp != NULL) && (*netp != NULL)) { sctp_handle_packet_dropped((struct sctp_pktdrop_chunk *)ch, - stcb, *netp, - min(chk_length, contiguous)); + stcb, *netp, + min(chk_length, contiguous)); } break; case SCTP_AUTHENTICATION: @@ -5702,21 +5663,21 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, auth_skipped = 1; } /* skip this chunk (temporarily) */ - goto next_chunk; + break; } if (stcb->asoc.auth_supported == 0) { goto unknown_chunk; } if ((chk_length < (sizeof(struct sctp_auth_chunk))) || (chk_length > (sizeof(struct sctp_auth_chunk) + - SCTP_AUTH_DIGEST_LEN_MAX))) { + SCTP_AUTH_DIGEST_LEN_MAX))) { /* Its not ours */ *offset = length; return 
(stcb); } if (got_auth == 1) { /* skip this chunk... it's already auth'd */ - goto next_chunk; + break; } got_auth = 1; if (sctp_handle_auth(stcb, (struct sctp_auth_chunk *)ch, m, *offset)) { @@ -5777,7 +5738,7 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, break; } ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset, - sizeof(struct sctp_chunkhdr), chunk_buf); + sizeof(struct sctp_chunkhdr), chunk_buf); if (ch == NULL) { *offset = length; return (stcb); diff --git a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_os_userspace.h b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_os_userspace.h index 6c3348ad9f..46b618110c 100755 --- a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_os_userspace.h +++ b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_os_userspace.h @@ -886,7 +886,7 @@ int sctp_userspace_get_mtu_from_ifn(uint32_t if_index, int af); #define SCTP_GATHER_MTU_FROM_ROUTE(sctp_ifa, sa, rt) ((rt != NULL) ? rt->rt_rmx.rmx_mtu : 0) -#define SCTP_GATHER_MTU_FROM_INTFC(sctp_ifn) sctp_userspace_get_mtu_from_ifn(if_nametoindex(((struct ifaddrs *) (sctp_ifn))->ifa_name), AF_INET) +#define SCTP_GATHER_MTU_FROM_INTFC(sctp_ifn) (sctp_ifn->ifn_mtu) #define SCTP_SET_MTU_OF_ROUTE(sa, rt, mtu) do { \ if (rt != NULL) \ diff --git a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_pcb.c b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_pcb.c index d30019b22a..d1e84daad5 100755 --- a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_pcb.c +++ b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_pcb.c @@ -7533,7 +7533,7 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, break; } phdr = sctp_get_next_param(m, offset, ¶m_buf, - sizeof(param_buf)); + sizeof(param_buf)); } /* Now check to see if we need to purge any addresses */ TAILQ_FOREACH_SAFE(net, &stcb->asoc.nets, sctp_next, nnet) { @@ -7543,11 +7543,15 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf 
*m, /* remove and free it */ stcb->asoc.numnets--; TAILQ_REMOVE(&stcb->asoc.nets, net, sctp_next); - sctp_free_remote_addr(net); + if (net == stcb->asoc.alternate) { + sctp_free_remote_addr(stcb->asoc.alternate); + stcb->asoc.alternate = NULL; + } if (net == stcb->asoc.primary_destination) { stcb->asoc.primary_destination = NULL; sctp_select_primary_destination(stcb); } + sctp_free_remote_addr(net); } } if ((stcb->asoc.ecn_supported == 1) && diff --git a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_sha1.c b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_sha1.c index db0e7533ff..8472c3a1c0 100755 --- a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_sha1.c +++ b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_sha1.c @@ -80,7 +80,25 @@ sctp_sha1_final(unsigned char *digest, struct sctp_sha1_context *ctx) { SHA1_Final(digest, &ctx->sha_ctx); } +#elif defined(SCTP_USE_MBEDTLS_SHA1) +void +sctp_sha1_init(struct sctp_sha1_context *ctx) +{ + mbedtls_sha1_init(&ctx->sha1_ctx); + mbedtls_sha1_starts_ret(&ctx->sha1_ctx); +} +void +sctp_sha1_update(struct sctp_sha1_context *ctx, const unsigned char *ptr, unsigned int siz) +{ + mbedtls_sha1_update_ret(&ctx->sha1_ctx, ptr, siz); +} + +void +sctp_sha1_final(unsigned char *digest, struct sctp_sha1_context *ctx) +{ + mbedtls_sha1_finish_ret(&ctx->sha1_ctx, digest); +} #else #include <string.h> diff --git a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_sha1.h b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_sha1.h index d535ee4639..9ff4ff7bdc 100755 --- a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_sha1.h +++ b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_sha1.h @@ -46,6 +46,8 @@ __FBSDID("$FreeBSD$"); #include <pk11pub.h> #elif defined(SCTP_USE_OPENSSL_SHA1) #include <openssl/sha.h> +#elif defined(SCTP_USE_MBEDTLS_SHA1) +#include <mbedtls/sha1.h> #endif struct sctp_sha1_context { @@ -53,6 +55,8 @@ struct sctp_sha1_context { struct PK11Context *pk11_ctx; #elif 
defined(SCTP_USE_OPENSSL_SHA1) SHA_CTX sha_ctx; +#elif defined(SCTP_USE_MBEDTLS_SHA1) + mbedtls_sha1_context sha1_ctx; #else unsigned int A; unsigned int B; diff --git a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_userspace.c b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_userspace.c index ba64aaff77..41aff19e08 100755 --- a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_userspace.c +++ b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_userspace.c @@ -98,23 +98,42 @@ sctp_userspace_set_threadname(const char *name) int sctp_userspace_get_mtu_from_ifn(uint32_t if_index, int af) { +#if defined(INET) || defined(INET6) struct ifreq ifr; int fd; +#endif + int mtu; - memset(&ifr, 0, sizeof(struct ifreq)); - if (if_indextoname(if_index, ifr.ifr_name) != NULL) { - /* TODO can I use the raw socket here and not have to open a new one with each query? */ - if ((fd = socket(af, SOCK_DGRAM, 0)) < 0) - return (0); - if (ioctl(fd, SIOCGIFMTU, &ifr) < 0) { + switch (af) { +#if defined(INET) + case AF_INET: +#endif +#if defined(INET6) + case AF_INET6: +#endif +#if defined(INET) || defined(INET6) + memset(&ifr, 0, sizeof(struct ifreq)); + mtu = 0; + if (if_indextoname(if_index, ifr.ifr_name) != NULL) { + /* TODO can I use the raw socket here and not have to open a new one with each query? 
*/ + if ((fd = socket(af, SOCK_DGRAM, 0)) < 0) { + break; + } + if (ioctl(fd, SIOCGIFMTU, &ifr) >= 0) { + mtu = ifr.ifr_mtu; + } close(fd); - return (0); } - close(fd); - return ifr.ifr_mtu; - } else { - return (0); + break; +#endif + case AF_CONN: + mtu = 1280; + break; + default: + mtu = 0; + break; } + return (mtu); } #endif @@ -143,41 +162,60 @@ timingsafe_bcmp(const void *b1, const void *b2, size_t n) int sctp_userspace_get_mtu_from_ifn(uint32_t if_index, int af) { +#if defined(INET) || defined(INET6) PIP_ADAPTER_ADDRESSES pAdapterAddrs, pAdapt; DWORD AdapterAddrsSize, Err; - int ret; +#endif + int mtu; - ret = 0; - AdapterAddrsSize = 0; - pAdapterAddrs = NULL; - if ((Err = GetAdaptersAddresses(AF_UNSPEC, 0, NULL, NULL, &AdapterAddrsSize)) != 0) { - if ((Err != ERROR_BUFFER_OVERFLOW) && (Err != ERROR_INSUFFICIENT_BUFFER)) { - SCTPDBG(SCTP_DEBUG_USR, "GetAdaptersAddresses() sizing failed with error code %d, AdapterAddrsSize = %d\n", Err, AdapterAddrsSize); - ret = -1; + switch (af) { +#if defined(INET) + case AF_INET: +#endif +#if defined(INET6) + case AF_INET6: +#endif +#if defined(INET) || defined(INET6) + mtu = 0; + AdapterAddrsSize = 0; + pAdapterAddrs = NULL; + if ((Err = GetAdaptersAddresses(AF_UNSPEC, 0, NULL, NULL, &AdapterAddrsSize)) != 0) { + if ((Err != ERROR_BUFFER_OVERFLOW) && (Err != ERROR_INSUFFICIENT_BUFFER)) { + SCTPDBG(SCTP_DEBUG_USR, "GetAdaptersAddresses() sizing failed with error code %d, AdapterAddrsSize = %d\n", Err, AdapterAddrsSize); + mtu = -1; + goto cleanup; + } + } + if ((pAdapterAddrs = (PIP_ADAPTER_ADDRESSES) GlobalAlloc(GPTR, AdapterAddrsSize)) == NULL) { + SCTPDBG(SCTP_DEBUG_USR, "Memory allocation error!\n"); + mtu = -1; goto cleanup; } - } - if ((pAdapterAddrs = (PIP_ADAPTER_ADDRESSES) GlobalAlloc(GPTR, AdapterAddrsSize)) == NULL) { - SCTPDBG(SCTP_DEBUG_USR, "Memory allocation error!\n"); - ret = -1; - goto cleanup; - } - if ((Err = GetAdaptersAddresses(AF_UNSPEC, 0, NULL, pAdapterAddrs, &AdapterAddrsSize)) != ERROR_SUCCESS) { 
- SCTPDBG(SCTP_DEBUG_USR, "GetAdaptersAddresses() failed with error code %d\n", Err); - ret = -1; - goto cleanup; - } - for (pAdapt = pAdapterAddrs; pAdapt; pAdapt = pAdapt->Next) { - if (pAdapt->IfIndex == if_index) { - ret = pAdapt->Mtu; - break; + if ((Err = GetAdaptersAddresses(AF_UNSPEC, 0, NULL, pAdapterAddrs, &AdapterAddrsSize)) != ERROR_SUCCESS) { + SCTPDBG(SCTP_DEBUG_USR, "GetAdaptersAddresses() failed with error code %d\n", Err); + mtu = -1; + goto cleanup; } + for (pAdapt = pAdapterAddrs; pAdapt; pAdapt = pAdapt->Next) { + if (pAdapt->IfIndex == if_index) { + mtu = pAdapt->Mtu; + break; + } + } + cleanup: + if (pAdapterAddrs != NULL) { + GlobalFree(pAdapterAddrs); + } + break; +#endif + case AF_CONN: + mtu = 1280; + break; + default: + mtu = 0; + break; } -cleanup: - if (pAdapterAddrs != NULL) { - GlobalFree(pAdapterAddrs); - } - return (ret); + return (mtu); } void diff --git a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_usrreq.c b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_usrreq.c index e5fba96717..e8cf78017a 100755 --- a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_usrreq.c +++ b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctp_usrreq.c @@ -34,7 +34,7 @@ #if defined(__FreeBSD__) && !defined(__Userspace__) #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/sys/netinet/sctp_usrreq.c 366750 2020-10-16 10:44:48Z tuexen $"); +__FBSDID("$FreeBSD$"); #endif #include <netinet/sctp_os.h> @@ -974,29 +974,29 @@ sctp_sendm(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) || (inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE))) { goto connected_type; - } else if (addr == NULL) { + } + + error = 0; + if (addr == NULL) { SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EDESTADDRREQ); error = EDESTADDRREQ; - sctp_m_freem(m); - if (control) { - sctp_m_freem(control); - control = NULL; - } - return (error); + } else if (addr->sa_family != AF_INET) { + 
SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EAFNOSUPPORT); + error = EAFNOSUPPORT; +#if defined(HAVE_SA_LEN) + } else if (addr->sa_len != sizeof(struct sockaddr_in)) { + SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; +#endif } -#ifdef INET6 - if (addr->sa_family != AF_INET) { - /* must be a v4 address! */ - SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EDESTADDRREQ); + if (error != 0) { sctp_m_freem(m); if (control) { sctp_m_freem(control); control = NULL; } - error = EDESTADDRREQ; return (error); } -#endif /* INET6 */ connected_type: /* now what about control */ if (control) { @@ -6112,6 +6112,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, return (EINVAL); } if ((paddrp->spp_flags & SPP_PMTUD_DISABLE) && + (paddrp->spp_pathmtu > 0) && ((paddrp->spp_pathmtu < SCTP_SMALLEST_PMTU) || (paddrp->spp_pathmtu > SCTP_LARGEST_PMTU))) { if (stcb) @@ -6156,28 +6157,30 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_11); } net->dest_state |= SCTP_ADDR_NO_PMTUD; - net->mtu = paddrp->spp_pathmtu; - switch (net->ro._l_addr.sa.sa_family) { + if (paddrp->spp_pathmtu > 0) { + net->mtu = paddrp->spp_pathmtu; + switch (net->ro._l_addr.sa.sa_family) { #ifdef INET - case AF_INET: - net->mtu += SCTP_MIN_V4_OVERHEAD; - break; + case AF_INET: + net->mtu += SCTP_MIN_V4_OVERHEAD; + break; #endif #ifdef INET6 - case AF_INET6: - net->mtu += SCTP_MIN_OVERHEAD; - break; + case AF_INET6: + net->mtu += SCTP_MIN_OVERHEAD; + break; #endif #if defined(__Userspace__) - case AF_CONN: - net->mtu += sizeof(struct sctphdr); - break; + case AF_CONN: + net->mtu += sizeof(struct sctphdr); + break; #endif - default: - break; - } - if (net->mtu < stcb->asoc.smallest_mtu) { - sctp_pathmtu_adjustment(stcb, net->mtu); + default: + break; + } + if (net->mtu < stcb->asoc.smallest_mtu) { + sctp_pathmtu_adjustment(stcb, net->mtu); + } 
} } if (paddrp->spp_flags & SPP_PMTUD_ENABLE) { @@ -6186,7 +6189,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } net->dest_state &= ~SCTP_ADDR_NO_PMTUD; } - if (paddrp->spp_pathmaxrxt) { + if (paddrp->spp_pathmaxrxt > 0) { if (net->dest_state & SCTP_ADDR_PF) { if (net->error_count > paddrp->spp_pathmaxrxt) { net->dest_state &= ~SCTP_ADDR_PF; @@ -6229,7 +6232,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, #endif } else { /************************ASSOC ONLY -- NO NET SPECIFIC SET ******************/ - if (paddrp->spp_pathmaxrxt != 0) { + if (paddrp->spp_pathmaxrxt > 0) { stcb->asoc.def_net_failure = paddrp->spp_pathmaxrxt; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (net->dest_state & SCTP_ADDR_PF) { @@ -6261,7 +6264,6 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, net->failure_threshold = paddrp->spp_pathmaxrxt; } } - if (paddrp->spp_flags & SPP_HB_ENABLE) { if (paddrp->spp_hbinterval != 0) { stcb->asoc.heart_beat_delay = paddrp->spp_hbinterval; @@ -6304,31 +6306,35 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_16); } net->dest_state |= SCTP_ADDR_NO_PMTUD; - net->mtu = paddrp->spp_pathmtu; - switch (net->ro._l_addr.sa.sa_family) { + if (paddrp->spp_pathmtu > 0) { + net->mtu = paddrp->spp_pathmtu; + switch (net->ro._l_addr.sa.sa_family) { #ifdef INET - case AF_INET: - net->mtu += SCTP_MIN_V4_OVERHEAD; - break; + case AF_INET: + net->mtu += SCTP_MIN_V4_OVERHEAD; + break; #endif #ifdef INET6 - case AF_INET6: - net->mtu += SCTP_MIN_OVERHEAD; - break; + case AF_INET6: + net->mtu += SCTP_MIN_OVERHEAD; + break; #endif #if defined(__Userspace__) - case AF_CONN: - net->mtu += sizeof(struct sctphdr); - break; + case AF_CONN: + net->mtu += sizeof(struct sctphdr); + break; #endif - default: - break; - } - if (net->mtu < stcb->asoc.smallest_mtu) { - sctp_pathmtu_adjustment(stcb, net->mtu); + default: + break; + } 
+ if (net->mtu < stcb->asoc.smallest_mtu) { + sctp_pathmtu_adjustment(stcb, net->mtu); + } } } - stcb->asoc.default_mtu = paddrp->spp_pathmtu; + if (paddrp->spp_pathmtu > 0) { + stcb->asoc.default_mtu = paddrp->spp_pathmtu; + } sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_DO_NOT_PMTUD); } if (paddrp->spp_flags & SPP_PMTUD_ENABLE) { @@ -6374,7 +6380,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, * For the TOS/FLOWLABEL stuff you set it * with the options on the socket */ - if (paddrp->spp_pathmaxrxt != 0) { + if (paddrp->spp_pathmaxrxt > 0) { inp->sctp_ep.def_net_failure = paddrp->spp_pathmaxrxt; } @@ -6400,7 +6406,9 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, inp->sctp_ep.default_mtu = 0; sctp_feature_off(inp, SCTP_PCB_FLAGS_DO_NOT_PMTUD); } else if (paddrp->spp_flags & SPP_PMTUD_DISABLE) { - inp->sctp_ep.default_mtu = paddrp->spp_pathmtu; + if (paddrp->spp_pathmtu > 0) { + inp->sctp_ep.default_mtu = paddrp->spp_pathmtu; + } sctp_feature_on(inp, SCTP_PCB_FLAGS_DO_NOT_PMTUD); } if (paddrp->spp_flags & SPP_DSCP) { diff --git a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctputil.c b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctputil.c index 79838e40da..639b36f307 100755 --- a/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctputil.c +++ b/third_party/usrsctp/usrsctplib/usrsctplib/netinet/sctputil.c @@ -4837,7 +4837,7 @@ sctp_handle_ootb(struct mbuf *m, int iphlen, int offset, * if there is return 1, else return 0. 
*/ int -sctp_is_there_an_abort_here(struct mbuf *m, int iphlen, uint32_t * vtagfill) +sctp_is_there_an_abort_here(struct mbuf *m, int iphlen, uint32_t *vtag) { struct sctp_chunkhdr *ch; struct sctp_init_chunk *init_chk, chunk_buf; @@ -4858,12 +4858,13 @@ sctp_is_there_an_abort_here(struct mbuf *m, int iphlen, uint32_t * vtagfill) /* yep, tell them */ return (1); } - if (ch->chunk_type == SCTP_INITIATION) { + if ((ch->chunk_type == SCTP_INITIATION) || + (ch->chunk_type == SCTP_INITIATION_ACK)) { /* need to update the Vtag */ init_chk = (struct sctp_init_chunk *)sctp_m_getptr(m, - offset, sizeof(*init_chk), (uint8_t *) & chunk_buf); + offset, sizeof(struct sctp_init_chunk), (uint8_t *) & chunk_buf); if (init_chk != NULL) { - *vtagfill = ntohl(init_chk->init.initiate_tag); + *vtag = ntohl(init_chk->init.initiate_tag); } } /* Nope, move to the next chunk */ diff --git a/third_party/usrsctp/usrsctplib/usrsctplib/netinet6/sctp6_usrreq.c b/third_party/usrsctp/usrsctplib/usrsctplib/netinet6/sctp6_usrreq.c index 5a931dd5a2..aa0c0051a5 100644 --- a/third_party/usrsctp/usrsctplib/usrsctplib/netinet6/sctp6_usrreq.c +++ b/third_party/usrsctp/usrsctplib/usrsctplib/netinet6/sctp6_usrreq.c @@ -34,7 +34,7 @@ #if defined(__FreeBSD__) && !defined(__Userspace__) #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/sys/netinet6/sctp6_usrreq.c 365071 2020-09-01 21:19:14Z mjg $"); +__FBSDID("$FreeBSD$"); #endif #include <netinet/sctp_os.h> @@ -259,13 +259,14 @@ sctp6_input(struct mbuf **i_pak, int *offp, int proto) if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { goto out; } - ecn_bits = ((ntohl(ip6->ip6_flow) >> 20) & 0x000000ff); #if defined(__FreeBSD__) + ecn_bits = IPV6_TRAFFIC_CLASS(ip6); if (m->m_pkthdr.csum_flags & CSUM_SCTP_VALID) { SCTP_STAT_INCR(sctps_recvhwcrc); compute_crc = 0; } else { #else + ecn_bits = ((ntohl(ip6->ip6_flow) >> 20) & 0x000000ff); if (SCTP_BASE_SYSCTL(sctp_no_csum_on_loopback) && (IN6_ARE_ADDR_EQUAL(&src.sin6_addr, &dst.sin6_addr))) { 
SCTP_STAT_INCR(sctps_recvhwcrc); @@ -654,9 +655,10 @@ out: return (error); } -SYSCTL_PROC(_net_inet6_sctp6, OID_AUTO, getcred, CTLTYPE_OPAQUE | CTLFLAG_RW, - 0, 0, - sctp6_getcred, "S,ucred", "Get the ucred of a SCTP6 connection"); +SYSCTL_PROC(_net_inet6_sctp6, OID_AUTO, getcred, + CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_NEEDGIANT, + 0, 0, sctp6_getcred, "S,ucred", + "Get the ucred of a SCTP6 connection"); #endif /* This is the same as the sctp_abort() could be made common */ @@ -1007,6 +1009,46 @@ sctp6_send(struct socket *so, int flags, struct mbuf *m, struct mbuf *nam, SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EDESTADDRREQ); return (EDESTADDRREQ); } + switch (addr->sa_family) { +#ifdef INET + case AF_INET: +#if defined(HAVE_SA_LEN) + if (addr->sa_len != sizeof(struct sockaddr_in)) { + if (control) { + SCTP_RELEASE_PKT(control); + control = NULL; + } + SCTP_RELEASE_PKT(m); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); + return (EINVAL); + } +#endif + break; +#endif +#ifdef INET6 + case AF_INET6: +#if defined(HAVE_SA_LEN) + if (addr->sa_len != sizeof(struct sockaddr_in6)) { + if (control) { + SCTP_RELEASE_PKT(control); + control = NULL; + } + SCTP_RELEASE_PKT(m); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); + return (EINVAL); + } +#endif + break; +#endif + default: + if (control) { + SCTP_RELEASE_PKT(control); + control = NULL; + } + SCTP_RELEASE_PKT(m); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); + return (EINVAL); + } #ifdef INET sin6 = (struct sockaddr_in6 *)addr; if (SCTP_IPV6_V6ONLY(inp)) { @@ -1015,10 +1057,20 @@ sctp6_send(struct socket *so, int flags, struct mbuf *m, struct mbuf *nam, * v4 addr or v4-mapped addr */ if (addr->sa_family == AF_INET) { + if (control) { + SCTP_RELEASE_PKT(control); + control = NULL; + } + SCTP_RELEASE_PKT(m); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } if 
(IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + if (control) { + SCTP_RELEASE_PKT(control); + control = NULL; + } + SCTP_RELEASE_PKT(m); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL); return (EINVAL); } |