aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/include/ultrahdr/gainmapmath.h21
-rw-r--r--lib/src/dsp/arm/gainmapmath_neon.cpp113
-rw-r--r--tests/gainmapmath_test.cpp147
3 files changed, 281 insertions, 0 deletions
diff --git a/lib/include/ultrahdr/gainmapmath.h b/lib/include/ultrahdr/gainmapmath.h
index 48aea05..686073a 100644
--- a/lib/include/ultrahdr/gainmapmath.h
+++ b/lib/include/ultrahdr/gainmapmath.h
@@ -26,6 +26,10 @@
#include "ultrahdr/ultrahdr.h"
#include "ultrahdr/jpegr.h"
+#if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON)))
+#include <arm_neon.h>
+#endif
+
// Clamps x to the inclusive range [min, max]. The whole expansion is parenthesized so the macro
// composes safely inside larger expressions (e.g. CLIP3(a, lo, hi) + 1 or 2 * CLIP3(a, lo, hi)).
// Note that the arguments may be evaluated more than once.
#define CLIP3(x, min, max) (((x) < (min)) ? (min) : (((x) > (max)) ? (max) : (x)))
namespace ultrahdr {
@@ -441,6 +445,23 @@ extern const std::array<float, 9> kYuvBt2100ToBt601;
Color yuvColorGamutConversion(Color e_gamma, const std::array<float, 9>& coeffs);
+#if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON)))
+
+extern const int16_t kYuv709To601_coeffs_neon[8];
+extern const int16_t kYuv709To2100_coeffs_neon[8];
+extern const int16_t kYuv601To709_coeffs_neon[8];
+extern const int16_t kYuv601To2100_coeffs_neon[8];
+extern const int16_t kYuv2100To709_coeffs_neon[8];
+extern const int16_t kYuv2100To601_coeffs_neon[8];
+
+/*
+ * The Y values are provided at half the bit width of the U & V values (8-bit lanes versus
+ * 16-bit lanes) to allow use of the widening arithmetic instructions.
+ */
+int16x8x3_t yuvConversion_neon(uint8x8_t y, int16x8_t u, int16x8_t v, int16x8_t coeffs);
+
+#endif
+
/*
* Performs a color gamut transformation on an entire YUV420 image.
*
diff --git a/lib/src/dsp/arm/gainmapmath_neon.cpp b/lib/src/dsp/arm/gainmapmath_neon.cpp
new file mode 100644
index 0000000..6536045
--- /dev/null
+++ b/lib/src/dsp/arm/gainmapmath_neon.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2024 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ultrahdr/gainmapmath.h"
+
+#include <arm_neon.h>
+
+namespace ultrahdr {
+
+// Scale all coefficients by 2^14 to avoid needing floating-point arithmetic. This can cause an
+// off-by-one error compared to the scalar floating-point implementation.
+
+// In every conversion matrix below the Y column is always {1.0, 0.0, 0.0}, so those three
+// coefficients are not stored. That leaves 6 coefficients per conversion. Pack them into a single
+// 128-bit vector as follows, zeroing the remaining elements:
+// {Y1, Y2, U1, U2, V1, V2, 0, 0}
+// Y1/Y2 are the U and V weights for the Y' output, U1/U2 for U', and V1/V2 for V'.
+
+// Yuv Bt709 -> Yuv Bt601
+// Y' = (1.0f * Y) + ( 0.101579f * U) + ( 0.196076f * V)
+// U' = (0.0f * Y) + ( 0.989854f * U) + (-0.110653f * V)
+// V' = (0.0f * Y) + (-0.072453f * U) + ( 0.983398f * V)
+// Stored values are the U and V weights above multiplied by 2^14 and rounded.
+// Standard alignas is used rather than the GNU-only __attribute__((aligned)) spelling.
+alignas(16) const int16_t kYuv709To601_coeffs_neon[8] = {1664, 3213, 16218, -1813, -1187, 16112, 0, 0};
+
+// Yuv Bt709 -> Yuv Bt2100
+// Y' = (1.0f * Y) + (-0.016969f * U) + ( 0.096312f * V)
+// U' = (0.0f * Y) + ( 0.995306f * U) + (-0.051192f * V)
+// V' = (0.0f * Y) + ( 0.011507f * U) + ( 1.002637f * V)
+// Stored values are the U and V weights above multiplied by 2^14 and rounded.
+// Standard alignas is used rather than the GNU-only __attribute__((aligned)) spelling.
+alignas(16) const int16_t kYuv709To2100_coeffs_neon[8] = {-278, 1578, 16307, -839, 189, 16427, 0, 0};
+
+// Yuv Bt601 -> Yuv Bt709
+// Y' = (1.0f * Y) + (-0.118188f * U) + (-0.212685f * V)
+// U' = (0.0f * Y) + ( 1.018640f * U) + ( 0.114618f * V)
+// V' = (0.0f * Y) + ( 0.075049f * U) + ( 1.025327f * V)
+// Stored values are the U and V weights above multiplied by 2^14 and rounded.
+// Standard alignas is used rather than the GNU-only __attribute__((aligned)) spelling.
+alignas(16) const int16_t kYuv601To709_coeffs_neon[8] = {-1936, -3485, 16689, 1878, 1230, 16799, 0, 0};
+
+// Yuv Bt601 -> Yuv Bt2100
+// Y' = (1.0f * Y) + (-0.128245f * U) + (-0.115879f * V)
+// U' = (0.0f * Y) + ( 1.010016f * U) + ( 0.061592f * V)
+// V' = (0.0f * Y) + ( 0.086969f * U) + ( 1.029350f * V)
+// Stored values are the U and V weights above multiplied by 2^14 and rounded.
+// Standard alignas is used rather than the GNU-only __attribute__((aligned)) spelling.
+alignas(16) const int16_t kYuv601To2100_coeffs_neon[8] = {-2101, -1899, 16548, 1009, 1425, 16865, 0, 0};
+
+// Yuv Bt2100 -> Yuv Bt709
+// Y' = (1.0f * Y) + ( 0.018149f * U) + (-0.095132f * V)
+// U' = (0.0f * Y) + ( 1.004123f * U) + ( 0.051267f * V)
+// V' = (0.0f * Y) + (-0.011524f * U) + ( 0.996782f * V)
+// Stored values are the U and V weights above multiplied by 2^14 and rounded.
+// Standard alignas is used rather than the GNU-only __attribute__((aligned)) spelling.
+alignas(16) const int16_t kYuv2100To709_coeffs_neon[8] = {297, -1559, 16452, 840, -189, 16331, 0, 0};
+
+// Yuv Bt2100 -> Yuv Bt601
+// Y' = (1.0f * Y) + ( 0.117887f * U) + ( 0.105521f * V)
+// U' = (0.0f * Y) + ( 0.995211f * U) + (-0.059549f * V)
+// V' = (0.0f * Y) + (-0.084085f * U) + ( 0.976518f * V)
+// Stored values are the U and V weights above multiplied by 2^14 and rounded.
+// Standard alignas is used rather than the GNU-only __attribute__((aligned)) spelling.
+alignas(16) const int16_t kYuv2100To601_coeffs_neon[8] = {1931, 1729, 16306, -976, -1378, 15999, 0, 0};
+
+// Computes Y' = Y + (Y1 * U + Y2 * V) / 2^14 for eight samples, where Y1 and Y2 are elements 0
+// and 1 of |coeffs|. |y| holds 8-bit samples that are widened to 16 bits as they are added.
+//
+// The multiply-by-lane steps use vmull_lane_s16/vmlal_lane_s16 on the low half of |coeffs|
+// instead of the *_laneq_* forms, because the latter are only provided for AArch64 while this
+// code is also enabled for 32-bit Neon targets. The selected elements are the same.
+static inline int16x8_t yConversion_neon(uint8x8_t y, int16x8_t u, int16x8_t v, int16x8_t coeffs) {
+  const int16x4_t coeffs_lo = vget_low_s16(coeffs);  // Elements 0..3 of coeffs.
+
+  int32x4_t lo = vmull_lane_s16(vget_low_s16(u), coeffs_lo, 0);
+  int32x4_t hi = vmull_lane_s16(vget_high_s16(u), coeffs_lo, 0);
+  lo = vmlal_lane_s16(lo, vget_low_s16(v), coeffs_lo, 1);
+  hi = vmlal_lane_s16(hi, vget_high_s16(v), coeffs_lo, 1);
+
+  // Descale result to account for coefficients being scaled by 2^14.
+  const uint16x8_t chroma_term =
+      vreinterpretq_u16_s16(vcombine_s16(vqrshrn_n_s32(lo, 14), vqrshrn_n_s32(hi, 14)));
+  // Widen the 8-bit Y samples and add them to the chroma contribution.
+  return vreinterpretq_s16_u16(vaddw_u8(chroma_term, y));
+}
+
+// Computes U' = (U1 * U + U2 * V) / 2^14 for eight samples, where U1 and U2 are elements 2 and 3
+// of |coeffs|.
+//
+// The multiply-by-lane steps use vmull_lane_s16/vmlal_lane_s16 on the low half of |coeffs|
+// instead of the *_laneq_* forms, because the latter are only provided for AArch64 while this
+// code is also enabled for 32-bit Neon targets. The selected elements are the same.
+static inline int16x8_t uConversion_neon(int16x8_t u, int16x8_t v, int16x8_t coeffs) {
+  const int16x4_t coeffs_lo = vget_low_s16(coeffs);  // Elements 0..3 of coeffs.
+
+  int32x4_t u_lo = vmull_lane_s16(vget_low_s16(u), coeffs_lo, 2);
+  int32x4_t u_hi = vmull_lane_s16(vget_high_s16(u), coeffs_lo, 2);
+  u_lo = vmlal_lane_s16(u_lo, vget_low_s16(v), coeffs_lo, 3);
+  u_hi = vmlal_lane_s16(u_hi, vget_high_s16(v), coeffs_lo, 3);
+
+  // Descale result to account for coefficients being scaled by 2^14.
+  return vcombine_s16(vqrshrn_n_s32(u_lo, 14), vqrshrn_n_s32(u_hi, 14));
+}
+
+// Computes V' = (V1 * U + V2 * V) / 2^14 for eight samples, where V1 and V2 are elements 4 and 5
+// of |coeffs|.
+//
+// The multiply-by-lane steps use vmull_lane_s16/vmlal_lane_s16 on the high half of |coeffs|
+// instead of the *_laneq_* forms, because the latter are only provided for AArch64 while this
+// code is also enabled for 32-bit Neon targets. Elements 4 and 5 of |coeffs| are lanes 0 and 1
+// of its high half.
+static inline int16x8_t vConversion_neon(int16x8_t u, int16x8_t v, int16x8_t coeffs) {
+  const int16x4_t coeffs_hi = vget_high_s16(coeffs);  // Elements 4..7 of coeffs.
+
+  int32x4_t v_lo = vmull_lane_s16(vget_low_s16(u), coeffs_hi, 0);
+  int32x4_t v_hi = vmull_lane_s16(vget_high_s16(u), coeffs_hi, 0);
+  v_lo = vmlal_lane_s16(v_lo, vget_low_s16(v), coeffs_hi, 1);
+  v_hi = vmlal_lane_s16(v_hi, vget_high_s16(v), coeffs_hi, 1);
+
+  // Descale result to account for coefficients being scaled by 2^14.
+  return vcombine_s16(vqrshrn_n_s32(v_lo, 14), vqrshrn_n_s32(v_hi, 14));
+}
+
+// Applies the conversion described by |coeffs| to eight YUV samples. The returned val[0], val[1]
+// and val[2] hold the converted Y, U and V samples respectively.
+int16x8x3_t yuvConversion_neon(uint8x8_t y, int16x8_t u, int16x8_t v, int16x8_t coeffs) {
+  int16x8x3_t converted;
+  converted.val[0] = yConversion_neon(y, u, v, coeffs);
+  converted.val[1] = uConversion_neon(u, v, coeffs);
+  converted.val[2] = vConversion_neon(u, v, coeffs);
+  return converted;
+}
+
+} // namespace ultrahdr
diff --git a/tests/gainmapmath_test.cpp b/tests/gainmapmath_test.cpp
index a602801..a1d61ce 100644
--- a/tests/gainmapmath_test.cpp
+++ b/tests/gainmapmath_test.cpp
@@ -97,6 +97,31 @@ class GainMapMathTest : public testing::Test {
Color Bt2100YuvGreen() { return {{{0.6780f, -0.36037f, -0.45979f}}}; }
Color Bt2100YuvBlue() { return {{{0.0593f, 0.5f, -0.04021f}}}; }
+  //////////////////////////////////////////////////////////////////////////////
+  // Integer reference pixels for code paths that use fixed-point arithmetic.
+
+  // RGB extremes.
+  Pixel RgbBlackPixel() { return Pixel{0, 0, 0}; }
+  Pixel RgbWhitePixel() { return Pixel{255, 255, 255}; }
+
+  // RGB primaries.
+  Pixel RgbRedPixel() { return Pixel{255, 0, 0}; }
+  Pixel RgbGreenPixel() { return Pixel{0, 255, 0}; }
+  Pixel RgbBluePixel() { return Pixel{0, 0, 255}; }
+
+  // YUV extremes (chroma is zero for both).
+  Pixel YuvBlackPixel() { return Pixel{0, 0, 0}; }
+  Pixel YuvWhitePixel() { return Pixel{255, 0, 0}; }
+
+  // YUV values of the primaries for sRGB.
+  Pixel SrgbYuvRedPixel() { return Pixel{54, -29, 128}; }
+  Pixel SrgbYuvGreenPixel() { return Pixel{182, -98, -116}; }
+  Pixel SrgbYuvBluePixel() { return Pixel{18, 128, -12}; }
+
+  // YUV values of the primaries for P3.
+  Pixel P3YuvRedPixel() { return Pixel{76, -43, 128}; }
+  Pixel P3YuvGreenPixel() { return Pixel{150, -84, -107}; }
+  Pixel P3YuvBluePixel() { return Pixel{29, 128, -21}; }
+
+  // YUV values of the primaries for Bt2100.
+  Pixel Bt2100YuvRedPixel() { return Pixel{67, -36, 128}; }
+  Pixel Bt2100YuvGreenPixel() { return Pixel{173, -92, -117}; }
+  Pixel Bt2100YuvBluePixel() { return Pixel{15, 128, -10}; }
+
float SrgbYuvToLuminance(Color yuv_gamma, ColorCalculationFn luminanceFn) {
Color rgb_gamma = srgbYuvToRgb(yuv_gamma);
Color rgb = srgbInvOetf(rgb_gamma);
@@ -655,6 +680,128 @@ TEST_F(GainMapMathTest, YuvColorGamutConversion) {
}
}
+#if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON)))
+// Runs yuvConversion_neon() once per set of conversion coefficients and compares every output
+// lane with the fixed-point reference pixels, tolerating a difference of 1 per component.
+TEST_F(GainMapMathTest, YuvConversionNeon) {
+  const std::array<Pixel, 5> SrgbYuvColors{YuvBlackPixel(), YuvWhitePixel(), SrgbYuvRedPixel(),
+                                           SrgbYuvGreenPixel(), SrgbYuvBluePixel()};
+
+  const std::array<Pixel, 5> P3YuvColors{YuvBlackPixel(), YuvWhitePixel(), P3YuvRedPixel(),
+                                         P3YuvGreenPixel(), P3YuvBluePixel()};
+
+  const std::array<Pixel, 5> Bt2100YuvColors{YuvBlackPixel(), YuvWhitePixel(), Bt2100YuvRedPixel(),
+                                             Bt2100YuvGreenPixel(), Bt2100YuvBluePixel()};
+
+  // One call converts 8 samples; the 5 reference colors are repeated cyclically to fill them.
+  constexpr size_t kNumLanes = 8;
+
+  // Each tuple contains three elements.
+  // 0. A pointer to the coefficients that will be passed to the Neon implementation
+  // 1. Input pixel/color array
+  // 2. The expected results
+  const std::array<
+      std::tuple<const int16_t*, const std::array<Pixel, 5>, const std::array<Pixel, 5>>, 6>
+      coeffs_setup_correct{{
+          {kYuv709To601_coeffs_neon, SrgbYuvColors, P3YuvColors},
+          {kYuv709To2100_coeffs_neon, SrgbYuvColors, Bt2100YuvColors},
+          {kYuv601To709_coeffs_neon, P3YuvColors, SrgbYuvColors},
+          {kYuv601To2100_coeffs_neon, P3YuvColors, Bt2100YuvColors},
+          {kYuv2100To709_coeffs_neon, Bt2100YuvColors, SrgbYuvColors},
+          {kYuv2100To601_coeffs_neon, Bt2100YuvColors, P3YuvColors},
+      }};
+
+  for (const auto& [coeff_ptr, input, expected] : coeffs_setup_correct) {
+    const int16x8_t coeffs = vld1q_s16(coeff_ptr);
+
+    // Lay the input colors out as separate Y, U and V sample arrays.
+    std::array<uint8_t, kNumLanes> y_in;
+    std::array<int16_t, kNumLanes> u_in;
+    std::array<int16_t, kNumLanes> v_in;
+    for (size_t lane = 0; lane < kNumLanes; ++lane) {
+      const Pixel& sample = input.at(lane % input.size());
+      y_in.at(lane) = static_cast<uint8_t>(sample.y);
+      u_in.at(lane) = static_cast<int16_t>(sample.u);
+      v_in.at(lane) = static_cast<int16_t>(sample.v);
+    }
+
+    const int16x8x3_t neon_result = yuvConversion_neon(
+        vld1_u8(y_in.data()), vld1q_s16(u_in.data()), vld1q_s16(v_in.data()), coeffs);
+
+    // Store the result vectors to memory so the lanes can be checked in a loop;
+    // vgetq_lane_s16 only accepts a compile-time constant lane number.
+    std::array<int16_t, kNumLanes> y_out;
+    std::array<int16_t, kNumLanes> u_out;
+    std::array<int16_t, kNumLanes> v_out;
+    vst1q_s16(y_out.data(), neon_result.val[0]);
+    vst1q_s16(u_out.data(), neon_result.val[1]);
+    vst1q_s16(v_out.data(), neon_result.val[2]);
+
+    for (size_t lane = 0; lane < kNumLanes; ++lane) {
+      const Pixel& want = expected.at(lane % expected.size());
+      EXPECT_NEAR(y_out.at(lane), want.y, 1) << "lane " << lane;
+      EXPECT_NEAR(u_out.at(lane), want.u, 1) << "lane " << lane;
+      EXPECT_NEAR(v_out.at(lane), want.v, 1) << "lane " << lane;
+    }
+  }
+}
+#endif
+
TEST_F(GainMapMathTest, TransformYuv420) {
jpegr_uncompressed_struct input = Yuv420Image();
const size_t buf_size = input.width * input.height * 3 / 2;