diff options
-rw-r--r-- | lib/include/ultrahdr/gainmapmath.h | 21 | ||||
-rw-r--r-- | lib/src/dsp/arm/gainmapmath_neon.cpp | 113 | ||||
-rw-r--r-- | tests/gainmapmath_test.cpp | 147 |
3 files changed, 281 insertions, 0 deletions
diff --git a/lib/include/ultrahdr/gainmapmath.h b/lib/include/ultrahdr/gainmapmath.h index 48aea05..686073a 100644 --- a/lib/include/ultrahdr/gainmapmath.h +++ b/lib/include/ultrahdr/gainmapmath.h @@ -26,6 +26,10 @@ #include "ultrahdr/ultrahdr.h" #include "ultrahdr/jpegr.h" +#if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON))) +#include <arm_neon.h> +#endif + #define CLIP3(x, min, max) ((x) < (min)) ? (min) : ((x) > (max)) ? (max) : (x) namespace ultrahdr { @@ -441,6 +445,23 @@ extern const std::array<float, 9> kYuvBt2100ToBt601; Color yuvColorGamutConversion(Color e_gamma, const std::array<float, 9>& coeffs); +#if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON))) + +extern const int16_t kYuv709To601_coeffs_neon[8]; +extern const int16_t kYuv709To2100_coeffs_neon[8]; +extern const int16_t kYuv601To709_coeffs_neon[8]; +extern const int16_t kYuv601To2100_coeffs_neon[8]; +extern const int16_t kYuv2100To709_coeffs_neon[8]; +extern const int16_t kYuv2100To601_coeffs_neon[8]; + +/* + * The Y values are provided at half the width of U & V values to allow use of the widening + * arithmetic instructions. + */ +int16x8x3_t yuvConversion_neon(uint8x8_t y, int16x8_t u, int16x8_t v, int16x8_t coeffs); + +#endif + /* * Performs a color gamut transformation on an entire YUV420 image. * diff --git a/lib/src/dsp/arm/gainmapmath_neon.cpp b/lib/src/dsp/arm/gainmapmath_neon.cpp new file mode 100644 index 0000000..6536045 --- /dev/null +++ b/lib/src/dsp/arm/gainmapmath_neon.cpp @@ -0,0 +1,113 @@ +/* + * Copyright 2024 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "ultrahdr/gainmapmath.h" + +#include <arm_neon.h> + +namespace ultrahdr { + +// Scale all coefficients by 2^14 to avoid needing floating-point arithmetic. This can cause an off +// by one error compared to the scalar floating-point implementation. + +// Removing conversion coefficients 1 and 0 from the group for each standard leaves 6 coefficients. +// Pack them into a single 128-bit vector as follows, zeroing the remaining elements: +// {Y1, Y2, U1, U2, V1, V2, 0, 0} + +// Yuv Bt709 -> Yuv Bt601 +// Y' = (1.0f * Y) + ( 0.101579f * U) + ( 0.196076f * V) +// U' = (0.0f * Y) + ( 0.989854f * U) + (-0.110653f * V) +// V' = (0.0f * Y) + (-0.072453f * U) + ( 0.983398f * V) +__attribute__((aligned(16))) +const int16_t kYuv709To601_coeffs_neon[8] = {1664, 3213, 16218, -1813, -1187, 16112, 0, 0}; + +// Yuv Bt709 -> Yuv Bt2100 +// Y' = (1.0f * Y) + (-0.016969f * U) + ( 0.096312f * V) +// U' = (0.0f * Y) + ( 0.995306f * U) + (-0.051192f * V) +// V' = (0.0f * Y) + ( 0.011507f * U) + ( 1.002637f * V) +__attribute__((aligned(16))) +const int16_t kYuv709To2100_coeffs_neon[8] = {-278, 1578, 16307, -839, 189, 16427, 0, 0}; + +// Yuv Bt601 -> Yuv Bt709 +// Y' = (1.0f * Y) + (-0.118188f * U) + (-0.212685f * V), +// U' = (0.0f * Y) + ( 1.018640f * U) + ( 0.114618f * V), +// V' = (0.0f * Y) + ( 0.075049f * U) + ( 1.025327f * V); +__attribute__((aligned(16))) +const int16_t kYuv601To709_coeffs_neon[8] = {-1936, -3485, 16689, 1878, 1230, 16799, 0, 0}; + +// Yuv Bt601 -> Yuv Bt2100 +// Y' = (1.0f * Y) + (-0.128245f * U) + (-0.115879f * V) +// U' = (0.0f * Y) + ( 1.010016f * U) + ( 0.061592f * V) +// V' = (0.0f * Y) + ( 0.086969f * U) + ( 1.029350f * V) +__attribute__((aligned(16))) +const int16_t kYuv601To2100_coeffs_neon[8] = {-2101, -1899, 16548, 1009, 1425, 16865, 0, 0}; + +// Yuv Bt2100 -> Yuv Bt709 +// Y' = (1.0f * Y) + ( 0.018149f * U) + (-0.095132f * V) +// U' = (0.0f * Y) + ( 1.004123f * U) + ( 0.051267f * V) +// V' = (0.0f * Y) + (-0.011524f * U) + ( 0.996782f * V) +__attribute__((aligned(16))) +const int16_t kYuv2100To709_coeffs_neon[8] = {297, -1559, 16452, 840, -189, 16331, 0, 0}; + +// Yuv Bt2100 -> Yuv Bt601 +// Y' = (1.0f * Y) + ( 0.117887f * U) + ( 0.105521f * V) +// U' = (0.0f * Y) + ( 0.995211f * U) + (-0.059549f * V) +// V' = (0.0f * Y) + (-0.084085f * U) + ( 0.976518f * V) +__attribute__((aligned(16))) +const int16_t kYuv2100To601_coeffs_neon[8] = {1931, 1729, 16306, -976, -1378, 15999, 0, 0}; + +static inline int16x8_t yConversion_neon(uint8x8_t y, int16x8_t u, int16x8_t v, int16x8_t coeffs) { + int32x4_t lo = vmull_laneq_s16(vget_low_s16(u), coeffs, 0); + int32x4_t hi = vmull_laneq_s16(vget_high_s16(u), coeffs, 0); + lo = vmlal_laneq_s16(lo, vget_low_s16(v), coeffs, 1); + hi = vmlal_laneq_s16(hi, vget_high_s16(v), coeffs, 1); + + // Descale result to account for coefficients being scaled by 2^14. + uint16x8_t y_output = + vreinterpretq_u16_s16(vcombine_s16(vqrshrn_n_s32(lo, 14), vqrshrn_n_s32(hi, 14))); + return vreinterpretq_s16_u16(vaddw_u8(y_output, y)); +} + +static inline int16x8_t uConversion_neon(int16x8_t u, int16x8_t v, int16x8_t coeffs) { + int32x4_t u_lo = vmull_laneq_s16(vget_low_s16(u), coeffs, 2); + int32x4_t u_hi = vmull_laneq_s16(vget_high_s16(u), coeffs, 2); + u_lo = vmlal_laneq_s16(u_lo, vget_low_s16(v), coeffs, 3); + u_hi = vmlal_laneq_s16(u_hi, vget_high_s16(v), coeffs, 3); + + // Descale result to account for coefficients being scaled by 2^14. + const int16x8_t u_output = vcombine_s16(vqrshrn_n_s32(u_lo, 14), vqrshrn_n_s32(u_hi, 14)); + return u_output; +} + +static inline int16x8_t vConversion_neon(int16x8_t u, int16x8_t v, int16x8_t coeffs) { + int32x4_t v_lo = vmull_laneq_s16(vget_low_s16(u), coeffs, 4); + int32x4_t v_hi = vmull_laneq_s16(vget_high_s16(u), coeffs, 4); + v_lo = vmlal_laneq_s16(v_lo, vget_low_s16(v), coeffs, 5); + v_hi = vmlal_laneq_s16(v_hi, vget_high_s16(v), coeffs, 5); + + // Descale result to account for coefficients being scaled by 2^14. + const int16x8_t v_output = vcombine_s16(vqrshrn_n_s32(v_lo, 14), vqrshrn_n_s32(v_hi, 14)); + return v_output; +} + +int16x8x3_t yuvConversion_neon(uint8x8_t y, int16x8_t u, int16x8_t v, int16x8_t coeffs) { + const int16x8_t y_output = yConversion_neon(y, u, v, coeffs); + const int16x8_t u_output = uConversion_neon(u, v, coeffs); + const int16x8_t v_output = vConversion_neon(u, v, coeffs); + return {y_output, u_output, v_output}; +} + +} // namespace ultrahdr diff --git a/tests/gainmapmath_test.cpp b/tests/gainmapmath_test.cpp index a602801..a1d61ce 100644 --- a/tests/gainmapmath_test.cpp +++ b/tests/gainmapmath_test.cpp @@ -97,6 +97,31 @@ class GainMapMathTest : public testing::Test { Color Bt2100YuvGreen() { return {{{0.6780f, -0.36037f, -0.45979f}}}; } Color Bt2100YuvBlue() { return {{{0.0593f, 0.5f, -0.04021f}}}; } + ////////////////////////////////////////////////////////////////////////////// + // Reference values for when using fixed-point arithmetic. + + Pixel RgbBlackPixel() { return {0, 0, 0}; } + Pixel RgbWhitePixel() { return {255, 255, 255}; } + + Pixel RgbRedPixel() { return {255, 0, 0}; } + Pixel RgbGreenPixel() { return {0, 255, 0}; } + Pixel RgbBluePixel() { return {0, 0, 255}; } + + Pixel YuvBlackPixel() { return {0, 0, 0}; } + Pixel YuvWhitePixel() { return {255, 0, 0}; } + + Pixel SrgbYuvRedPixel() { return {54, -29, 128}; } + Pixel SrgbYuvGreenPixel() { return {182, -98, -116}; } + Pixel SrgbYuvBluePixel() { return {18, 128, -12}; } + + Pixel P3YuvRedPixel() { return {76, -43, 128}; } + Pixel P3YuvGreenPixel() { return {150, -84, -107}; } + Pixel P3YuvBluePixel() { return {29, 128, -21}; } + + Pixel Bt2100YuvRedPixel() { return {67, -36, 128}; } + Pixel Bt2100YuvGreenPixel() { return {173, -92, -117}; } + Pixel Bt2100YuvBluePixel() { return {15, 128, -10}; } + float SrgbYuvToLuminance(Color yuv_gamma, ColorCalculationFn luminanceFn) { Color rgb_gamma = srgbYuvToRgb(yuv_gamma); Color rgb = srgbInvOetf(rgb_gamma); @@ -655,6 +680,128 @@ TEST_F(GainMapMathTest, YuvColorGamutConversion) { } } +#if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON))) +TEST_F(GainMapMathTest, YuvConversionNeon) { + const std::array<Pixel, 5> SrgbYuvColors{YuvBlackPixel(), YuvWhitePixel(), SrgbYuvRedPixel(), + SrgbYuvGreenPixel(), SrgbYuvBluePixel()}; + + const std::array<Pixel, 5> P3YuvColors{YuvBlackPixel(), YuvWhitePixel(), P3YuvRedPixel(), + P3YuvGreenPixel(), P3YuvBluePixel()}; + + const std::array<Pixel, 5> Bt2100YuvColors{YuvBlackPixel(), YuvWhitePixel(), Bt2100YuvRedPixel(), + Bt2100YuvGreenPixel(), Bt2100YuvBluePixel()}; + + struct InputSamples { + std::array<uint8_t, 8> y; + std::array<int16_t, 8> u; + std::array<int16_t, 8> v; + }; + + struct ExpectedSamples { + std::array<int16_t, 8> y; + std::array<int16_t, 8> u; + std::array<int16_t, 8> v; + }; + + // Each tuple contains three elements. + // 0. A pointer to the coefficients that will be passed to the Neon implementation + // 1. Input pixel/color array + // 2. The expected results + const std::array< + std::tuple<const int16_t*, const std::array<Pixel, 5>, const std::array<Pixel, 5>>, 6> + coeffs_setup_correct{{ + {kYuv709To601_coeffs_neon, SrgbYuvColors, P3YuvColors}, + {kYuv709To2100_coeffs_neon, SrgbYuvColors, Bt2100YuvColors}, + {kYuv601To709_coeffs_neon, P3YuvColors, SrgbYuvColors}, + {kYuv601To2100_coeffs_neon, P3YuvColors, Bt2100YuvColors}, + {kYuv2100To709_coeffs_neon, Bt2100YuvColors, SrgbYuvColors}, + {kYuv2100To601_coeffs_neon, Bt2100YuvColors, P3YuvColors}, + }}; + + for (const auto& [coeff_ptr, input, expected] : coeffs_setup_correct) { + const int16x8_t coeffs = vld1q_s16(coeff_ptr); + InputSamples input_values; + ExpectedSamples expected_values; + for (size_t sample_idx = 0; sample_idx < 8; ++sample_idx) { + size_t ring_idx = sample_idx % input.size(); + input_values.y.at(sample_idx) = static_cast<uint8_t>(input.at(ring_idx).y); + input_values.u.at(sample_idx) = input.at(ring_idx).u; + input_values.v.at(sample_idx) = input.at(ring_idx).v; + + expected_values.y.at(sample_idx) = expected.at(ring_idx).y; + expected_values.u.at(sample_idx) = expected.at(ring_idx).u; + expected_values.v.at(sample_idx) = expected.at(ring_idx).v; + } + + const uint8x8_t y_neon = vld1_u8(input_values.y.data()); + const int16x8_t u_neon = vld1q_s16(input_values.u.data()); + const int16x8_t v_neon = vld1q_s16(input_values.v.data()); + + const int16x8x3_t neon_result = yuvConversion_neon(y_neon, u_neon, v_neon, coeffs); + + const int16x8_t y_neon_result = neon_result.val[0]; + const int16x8_t u_neon_result = neon_result.val[1]; + const int16x8_t v_neon_result = neon_result.val[2]; + + const Pixel result0 = {vgetq_lane_s16(y_neon_result, 0), vgetq_lane_s16(u_neon_result, 0), + vgetq_lane_s16(v_neon_result, 0)}; + + const Pixel result1 = {vgetq_lane_s16(y_neon_result, 1), vgetq_lane_s16(u_neon_result, 1), + vgetq_lane_s16(v_neon_result, 1)}; + + const Pixel result2 = {vgetq_lane_s16(y_neon_result, 2), vgetq_lane_s16(u_neon_result, 2), + vgetq_lane_s16(v_neon_result, 2)}; + + const Pixel result3 = {vgetq_lane_s16(y_neon_result, 3), vgetq_lane_s16(u_neon_result, 3), + vgetq_lane_s16(v_neon_result, 3)}; + + const Pixel result4 = {vgetq_lane_s16(y_neon_result, 4), vgetq_lane_s16(u_neon_result, 4), + vgetq_lane_s16(v_neon_result, 4)}; + + const Pixel result5 = {vgetq_lane_s16(y_neon_result, 5), vgetq_lane_s16(u_neon_result, 5), + vgetq_lane_s16(v_neon_result, 5)}; + + const Pixel result6 = {vgetq_lane_s16(y_neon_result, 6), vgetq_lane_s16(u_neon_result, 6), + vgetq_lane_s16(v_neon_result, 6)}; + + const Pixel result7 = {vgetq_lane_s16(y_neon_result, 7), vgetq_lane_s16(u_neon_result, 7), + vgetq_lane_s16(v_neon_result, 7)}; + + EXPECT_NEAR(result0.y, expected_values.y.at(0), 1); + EXPECT_NEAR(result0.u, expected_values.u.at(0), 1); + EXPECT_NEAR(result0.v, expected_values.v.at(0), 1); + + EXPECT_NEAR(result1.y, expected_values.y.at(1), 1); + EXPECT_NEAR(result1.u, expected_values.u.at(1), 1); + EXPECT_NEAR(result1.v, expected_values.v.at(1), 1); + + EXPECT_NEAR(result2.y, expected_values.y.at(2), 1); + EXPECT_NEAR(result2.u, expected_values.u.at(2), 1); + EXPECT_NEAR(result2.v, expected_values.v.at(2), 1); + + EXPECT_NEAR(result3.y, expected_values.y.at(3), 1); + EXPECT_NEAR(result3.u, expected_values.u.at(3), 1); + EXPECT_NEAR(result3.v, expected_values.v.at(3), 1); + + EXPECT_NEAR(result4.y, expected_values.y.at(4), 1); + EXPECT_NEAR(result4.u, expected_values.u.at(4), 1); + EXPECT_NEAR(result4.v, expected_values.v.at(4), 1); + + EXPECT_NEAR(result5.y, expected_values.y.at(5), 1); + EXPECT_NEAR(result5.u, expected_values.u.at(5), 1); + EXPECT_NEAR(result5.v, expected_values.v.at(5), 1); + + EXPECT_NEAR(result6.y, expected_values.y.at(6), 1); + EXPECT_NEAR(result6.u, expected_values.u.at(6), 1); + EXPECT_NEAR(result6.v, expected_values.v.at(6), 1); + + EXPECT_NEAR(result7.y, expected_values.y.at(7), 1); + EXPECT_NEAR(result7.u, expected_values.u.at(7), 1); + EXPECT_NEAR(result7.v, expected_values.v.at(7), 1); + } +} +#endif + TEST_F(GainMapMathTest, TransformYuv420) { jpegr_uncompressed_struct input = Yuv420Image(); const size_t buf_size = input.width * input.height * 3 / 2; |