3 files changed, 281 insertions, 0 deletions
diff --git a/lib/include/ultrahdr/gainmapmath.h b/lib/include/ultrahdr/gainmapmath.h
index 48aea05..686073a 100644
--- a/lib/include/ultrahdr/gainmapmath.h
+++ b/lib/include/ultrahdr/gainmapmath.h
@@ -26,6 +26,10 @@
 #include "ultrahdr/ultrahdr.h"
 #include "ultrahdr/jpegr.h"
 
+#if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON)))
+#include <arm_neon.h>
+#endif
+
 #define CLIP3(x, min, max) ((x) < (min)) ? (min) : ((x) > (max)) ? (max) : (x)
 
 namespace ultrahdr {
@@ -441,6 +445,23 @@ extern const std::array<float, 9> kYuvBt2100ToBt601;
 
 Color yuvColorGamutConversion(Color e_gamma, const std::array<float, 9>& coeffs);
 
+#if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON)))
+
+extern const int16_t kYuv709To601_coeffs_neon[8];
+extern const int16_t kYuv709To2100_coeffs_neon[8];
+extern const int16_t kYuv601To709_coeffs_neon[8];
+extern const int16_t kYuv601To2100_coeffs_neon[8];
+extern const int16_t kYuv2100To709_coeffs_neon[8];
+extern const int16_t kYuv2100To601_coeffs_neon[8];
+
+/*
+ * Converts eight YUV samples with `coeffs`; the result holds Y', U', V' in that order.
+ * Y is taken as 8-bit, half the width of U & V, to allow use of widening arithmetic instructions.
+ */
+int16x8x3_t yuvConversion_neon(uint8x8_t y, int16x8_t u, int16x8_t v, int16x8_t coeffs);
+
+#endif
+
 /*
  * Performs a color gamut transformation on an entire YUV420 image.
  *
diff --git a/lib/src/dsp/arm/gainmapmath_neon.cpp b/lib/src/dsp/arm/gainmapmath_neon.cpp
new file mode 100644
index 0000000..6536045
--- /dev/null
+++ b/lib/src/dsp/arm/gainmapmath_neon.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2024 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ultrahdr/gainmapmath.h"
+
+#include <arm_neon.h>
+
+namespace ultrahdr {
+
+// Scale all coefficients by 2^14 to avoid needing floating-point arithmetic. This can cause an
+// off-by-one error compared to the scalar floating-point implementation.
+
+// Dropping the constant 1 and 0 coefficients (the Y column) leaves 6 coefficients per conversion.
+// Pack them row by row into a single 128-bit vector, zeroing the remaining elements:
+// {Y1, Y2, U1, U2, V1, V2, 0, 0}
+
+// Yuv Bt709 -> Yuv Bt601, each coefficient stored as round(coefficient * 2^14)
+// Y' = (1.0f * Y) + ( 0.101579f * U) + ( 0.196076f * V)
+// U' = (0.0f * Y) + ( 0.989854f * U) + (-0.110653f * V)
+// V' = (0.0f * Y) + (-0.072453f * U) + ( 0.983398f * V)
+alignas(16)
+const int16_t kYuv709To601_coeffs_neon[8] = {1664, 3213, 16218, -1813, -1187, 16112, 0, 0};
+
+// Yuv Bt709 -> Yuv Bt2100, each coefficient stored as round(coefficient * 2^14)
+// Y' = (1.0f * Y) + (-0.016969f * U) + ( 0.096312f * V)
+// U' = (0.0f * Y) + ( 0.995306f * U) + (-0.051192f * V)
+// V' = (0.0f * Y) + ( 0.011507f * U) + ( 1.002637f * V)
+alignas(16)
+const int16_t kYuv709To2100_coeffs_neon[8] = {-278, 1578, 16307, -839, 189, 16427, 0, 0};
+
+// Yuv Bt601 -> Yuv Bt709, each coefficient stored as round(coefficient * 2^14)
+// Y' = (1.0f * Y) + (-0.118188f * U) + (-0.212685f * V)
+// U' = (0.0f * Y) + ( 1.018640f * U) + ( 0.114618f * V)
+// V' = (0.0f * Y) + ( 0.075049f * U) + ( 1.025327f * V)
+alignas(16)
+const int16_t kYuv601To709_coeffs_neon[8] = {-1936, -3485, 16689, 1878, 1230, 16799, 0, 0};
+
+// Yuv Bt601 -> Yuv Bt2100, each coefficient stored as round(coefficient * 2^14)
+// Y' = (1.0f * Y) + (-0.128245f * U) + (-0.115879f * V)
+// U' = (0.0f * Y) + ( 1.010016f * U) + ( 0.061592f * V)
+// V' = (0.0f * Y) + ( 0.086969f * U) + ( 1.029350f * V)
+alignas(16)
+const int16_t kYuv601To2100_coeffs_neon[8] = {-2101, -1899, 16548, 1009, 1425, 16865, 0, 0};
+
+// Yuv Bt2100 -> Yuv Bt709, each coefficient stored as round(coefficient * 2^14)
+// Y' = (1.0f * Y) + ( 0.018149f * U) + (-0.095132f * V)
+// U' = (0.0f * Y) + ( 1.004123f * U) + ( 0.051267f * V)
+// V' = (0.0f * Y) + (-0.011524f * U) + ( 0.996782f * V)
+alignas(16)
+const int16_t kYuv2100To709_coeffs_neon[8] = {297, -1559, 16452, 840, -189, 16331, 0, 0};
+
+// Yuv Bt2100 -> Yuv Bt601, each coefficient stored as round(coefficient * 2^14)
+// Y' = (1.0f * Y) + ( 0.117887f * U) + ( 0.105521f * V)
+// U' = (0.0f * Y) + ( 0.995211f * U) + (-0.059549f * V)
+// V' = (0.0f * Y) + (-0.084085f * U) + ( 0.976518f * V)
+alignas(16)
+const int16_t kYuv2100To601_coeffs_neon[8] = {1931, 1729, 16306, -976, -1378, 15999, 0, 0};
+
+// Returns Y' = Y + round((coeffs[0] * U + coeffs[1] * V) / 2^14) for eight samples.
+static inline int16x8_t yConversion_neon(uint8x8_t y, int16x8_t u, int16x8_t v, int16x8_t coeffs) {
+  const int16x4_t y_coeffs = vget_low_s16(coeffs);  // 64-bit lane source also builds on Armv7
+  int32x4_t lo = vmull_lane_s16(vget_low_s16(u), y_coeffs, 0);
+  int32x4_t hi = vmull_lane_s16(vget_high_s16(u), y_coeffs, 0);
+  lo = vmlal_lane_s16(lo, vget_low_s16(v), y_coeffs, 1);
+  hi = vmlal_lane_s16(hi, vget_high_s16(v), y_coeffs, 1);
+  // A rounding, saturating narrowing shift by 14 undoes the 2^14 scaling of the coefficients.
+  const int16x8_t uv_term = vcombine_s16(vqrshrn_n_s32(lo, 14), vqrshrn_n_s32(hi, 14));
+  return vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(uv_term), y));
+}
+
+// Returns U' = round((coeffs[2] * U + coeffs[3] * V) / 2^14) for eight samples.
+static inline int16x8_t uConversion_neon(int16x8_t u, int16x8_t v, int16x8_t coeffs) {
+  const int16x4_t u_coeffs = vget_low_s16(coeffs);  // 64-bit lane source also builds on Armv7
+  int32x4_t u_lo = vmull_lane_s16(vget_low_s16(u), u_coeffs, 2);
+  int32x4_t u_hi = vmull_lane_s16(vget_high_s16(u), u_coeffs, 2);
+  u_lo = vmlal_lane_s16(u_lo, vget_low_s16(v), u_coeffs, 3);
+  u_hi = vmlal_lane_s16(u_hi, vget_high_s16(v), u_coeffs, 3);
+  // A rounding, saturating narrowing shift by 14 undoes the 2^14 scaling of the coefficients.
+  return vcombine_s16(vqrshrn_n_s32(u_lo, 14), vqrshrn_n_s32(u_hi, 14));
+}
+
+// Returns V' = round((coeffs[4] * U + coeffs[5] * V) / 2^14) for eight samples.
+static inline int16x8_t vConversion_neon(int16x8_t u, int16x8_t v, int16x8_t coeffs) {
+  const int16x4_t v_coeffs = vget_high_s16(coeffs);  // coeffs[4..7]; lane form builds on Armv7
+  int32x4_t v_lo = vmull_lane_s16(vget_low_s16(u), v_coeffs, 0);
+  int32x4_t v_hi = vmull_lane_s16(vget_high_s16(u), v_coeffs, 0);
+  v_lo = vmlal_lane_s16(v_lo, vget_low_s16(v), v_coeffs, 1);
+  v_hi = vmlal_lane_s16(v_hi, vget_high_s16(v), v_coeffs, 1);
+  // A rounding, saturating narrowing shift by 14 undoes the 2^14 scaling of the coefficients.
+  return vcombine_s16(vqrshrn_n_s32(v_lo, 14), vqrshrn_n_s32(v_hi, 14));
+}
+
+// Converts eight YUV samples with the packed coefficients; val[0..2] hold Y', U' and V'.
+int16x8x3_t yuvConversion_neon(uint8x8_t y, int16x8_t u, int16x8_t v, int16x8_t coeffs) {
+  return {yConversion_neon(y, u, v, coeffs),  // val[0]: Y'
+          uConversion_neon(u, v, coeffs),     // val[1]: U'
+          vConversion_neon(u, v, coeffs)};    // val[2]: V'
+}
+
+}  // namespace ultrahdr
diff --git a/tests/gainmapmath_test.cpp b/tests/gainmapmath_test.cpp
index a602801..a1d61ce 100644
--- a/tests/gainmapmath_test.cpp
+++ b/tests/gainmapmath_test.cpp
@@ -97,6 +97,31 @@ class GainMapMathTest : public testing::Test {
   Color Bt2100YuvGreen() { return {{{0.6780f, -0.36037f, -0.45979f}}}; }
   Color Bt2100YuvBlue() { return {{{0.0593f, 0.5f, -0.04021f}}}; }
 
+  //////////////////////////////////////////////////////////////////////////////
+  // Reference values for use with fixed-point arithmetic.
+
+  Pixel RgbBlackPixel() { return {0, 0, 0}; }
+  Pixel RgbWhitePixel() { return {255, 255, 255}; }
+  // Red, green and blue, each with a single component set to 255.
+  Pixel RgbRedPixel() { return {255, 0, 0}; }
+  Pixel RgbGreenPixel() { return {0, 255, 0}; }
+  Pixel RgbBluePixel() { return {0, 0, 255}; }
+  // Black and white in YUV form; only the first (Y) component differs.
+  Pixel YuvBlackPixel() { return {0, 0, 0}; }
+  Pixel YuvWhitePixel() { return {255, 0, 0}; }
+  // Primaries that YuvConversionNeon pairs with the Bt709 coefficient tables.
+  Pixel SrgbYuvRedPixel() { return {54, -29, 128}; }
+  Pixel SrgbYuvGreenPixel() { return {182, -98, -116}; }
+  Pixel SrgbYuvBluePixel() { return {18, 128, -12}; }
+  // Primaries that YuvConversionNeon pairs with the Bt601 coefficient tables.
+  Pixel P3YuvRedPixel() { return {76, -43, 128}; }
+  Pixel P3YuvGreenPixel() { return {150, -84, -107}; }
+  Pixel P3YuvBluePixel() { return {29, 128, -21}; }
+  // Primaries that YuvConversionNeon pairs with the Bt2100 coefficient tables.
+  Pixel Bt2100YuvRedPixel() { return {67, -36, 128}; }
+  Pixel Bt2100YuvGreenPixel() { return {173, -92, -117}; }
+  Pixel Bt2100YuvBluePixel() { return {15, 128, -10}; }
+
   float SrgbYuvToLuminance(Color yuv_gamma, ColorCalculationFn luminanceFn) {
     Color rgb_gamma = srgbYuvToRgb(yuv_gamma);
     Color rgb = srgbInvOetf(rgb_gamma);
@@ -655,6 +680,128 @@ TEST_F(GainMapMathTest, YuvColorGamutConversion) {
   }
 }
 
+#if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON)))
+TEST_F(GainMapMathTest, YuvConversionNeon) {
+  const std::array<Pixel, 5> SrgbYuvColors{YuvBlackPixel(), YuvWhitePixel(), SrgbYuvRedPixel(),
+                                           SrgbYuvGreenPixel(), SrgbYuvBluePixel()};
+
+  const std::array<Pixel, 5> P3YuvColors{YuvBlackPixel(), YuvWhitePixel(), P3YuvRedPixel(),
+                                         P3YuvGreenPixel(), P3YuvBluePixel()};
+
+  const std::array<Pixel, 5> Bt2100YuvColors{YuvBlackPixel(), YuvWhitePixel(), Bt2100YuvRedPixel(),
+                                             Bt2100YuvGreenPixel(), Bt2100YuvBluePixel()};
+
+  struct InputSamples {
+    std::array<uint8_t, 8> y;
+    std::array<int16_t, 8> u;
+    std::array<int16_t, 8> v;
+  };
+
+  struct ExpectedSamples {
+    std::array<int16_t, 8> y;
+    std::array<int16_t, 8> u;
+    std::array<int16_t, 8> v;
+  };
+
+  // Each tuple contains three elements.
+  // 0. A pointer to the coefficients that will be passed to the Neon implementation
+  // 1. Input pixel/color array, expressed in the source color standard
+  // 2. The expected results: the same colors expressed in the destination color standard
+  const std::array<
+      std::tuple<const int16_t*, const std::array<Pixel, 5>, const std::array<Pixel, 5>>, 6>
+      coeffs_setup_correct{{
+          {kYuv709To601_coeffs_neon, SrgbYuvColors, P3YuvColors},
+          {kYuv709To2100_coeffs_neon, SrgbYuvColors, Bt2100YuvColors},
+          {kYuv601To709_coeffs_neon, P3YuvColors, SrgbYuvColors},
+          {kYuv601To2100_coeffs_neon, P3YuvColors, Bt2100YuvColors},
+          {kYuv2100To709_coeffs_neon, Bt2100YuvColors, SrgbYuvColors},
+          {kYuv2100To601_coeffs_neon, Bt2100YuvColors, P3YuvColors},
+      }};
+
+  for (const auto& [coeff_ptr, input, expected] : coeffs_setup_correct) {
+    const int16x8_t coeffs = vld1q_s16(coeff_ptr);
+    InputSamples input_values;
+    ExpectedSamples expected_values;
+    for (size_t sample_idx = 0; sample_idx < 8; ++sample_idx) {
+      size_t ring_idx = sample_idx % input.size();  // cycle the 5 colors to fill 8 lanes
+      input_values.y.at(sample_idx) = static_cast<uint8_t>(input.at(ring_idx).y);
+      input_values.u.at(sample_idx) = input.at(ring_idx).u;
+      input_values.v.at(sample_idx) = input.at(ring_idx).v;
+
+      expected_values.y.at(sample_idx) = expected.at(ring_idx).y;
+      expected_values.u.at(sample_idx) = expected.at(ring_idx).u;
+      expected_values.v.at(sample_idx) = expected.at(ring_idx).v;
+    }
+    // Load the staged samples into Neon registers.
+    const uint8x8_t y_neon = vld1_u8(input_values.y.data());
+    const int16x8_t u_neon = vld1q_s16(input_values.u.data());
+    const int16x8_t v_neon = vld1q_s16(input_values.v.data());
+
+    const int16x8x3_t neon_result = yuvConversion_neon(y_neon, u_neon, v_neon, coeffs);
+
+    const int16x8_t y_neon_result = neon_result.val[0];
+    const int16x8_t u_neon_result = neon_result.val[1];
+    const int16x8_t v_neon_result = neon_result.val[2];
+    // vgetq_lane_s16 needs a compile-time lane index, so every lane is extracted explicitly.
+    const Pixel result0 = {vgetq_lane_s16(y_neon_result, 0), vgetq_lane_s16(u_neon_result, 0),
+                           vgetq_lane_s16(v_neon_result, 0)};
+
+    const Pixel result1 = {vgetq_lane_s16(y_neon_result, 1), vgetq_lane_s16(u_neon_result, 1),
+                           vgetq_lane_s16(v_neon_result, 1)};
+
+    const Pixel result2 = {vgetq_lane_s16(y_neon_result, 2), vgetq_lane_s16(u_neon_result, 2),
+                           vgetq_lane_s16(v_neon_result, 2)};
+
+    const Pixel result3 = {vgetq_lane_s16(y_neon_result, 3), vgetq_lane_s16(u_neon_result, 3),
+                           vgetq_lane_s16(v_neon_result, 3)};
+
+    const Pixel result4 = {vgetq_lane_s16(y_neon_result, 4), vgetq_lane_s16(u_neon_result, 4),
+                           vgetq_lane_s16(v_neon_result, 4)};
+
+    const Pixel result5 = {vgetq_lane_s16(y_neon_result, 5), vgetq_lane_s16(u_neon_result, 5),
+                           vgetq_lane_s16(v_neon_result, 5)};
+
+    const Pixel result6 = {vgetq_lane_s16(y_neon_result, 6), vgetq_lane_s16(u_neon_result, 6),
+                           vgetq_lane_s16(v_neon_result, 6)};
+
+    const Pixel result7 = {vgetq_lane_s16(y_neon_result, 7), vgetq_lane_s16(u_neon_result, 7),
+                           vgetq_lane_s16(v_neon_result, 7)};
+    // A tolerance of 1 absorbs the off-by-one error the fixed-point arithmetic can introduce.
+    EXPECT_NEAR(result0.y, expected_values.y.at(0), 1);
+    EXPECT_NEAR(result0.u, expected_values.u.at(0), 1);
+    EXPECT_NEAR(result0.v, expected_values.v.at(0), 1);
+
+    EXPECT_NEAR(result1.y, expected_values.y.at(1), 1);
+    EXPECT_NEAR(result1.u, expected_values.u.at(1), 1);
+    EXPECT_NEAR(result1.v, expected_values.v.at(1), 1);
+
+    EXPECT_NEAR(result2.y, expected_values.y.at(2), 1);
+    EXPECT_NEAR(result2.u, expected_values.u.at(2), 1);
+    EXPECT_NEAR(result2.v, expected_values.v.at(2), 1);
+
+    EXPECT_NEAR(result3.y, expected_values.y.at(3), 1);
+    EXPECT_NEAR(result3.u, expected_values.u.at(3), 1);
+    EXPECT_NEAR(result3.v, expected_values.v.at(3), 1);
+
+    EXPECT_NEAR(result4.y, expected_values.y.at(4), 1);
+    EXPECT_NEAR(result4.u, expected_values.u.at(4), 1);
+    EXPECT_NEAR(result4.v, expected_values.v.at(4), 1);
+
+    EXPECT_NEAR(result5.y, expected_values.y.at(5), 1);
+    EXPECT_NEAR(result5.u, expected_values.u.at(5), 1);
+    EXPECT_NEAR(result5.v, expected_values.v.at(5), 1);
+
+    EXPECT_NEAR(result6.y, expected_values.y.at(6), 1);
+    EXPECT_NEAR(result6.u, expected_values.u.at(6), 1);
+    EXPECT_NEAR(result6.v, expected_values.v.at(6), 1);
+
+    EXPECT_NEAR(result7.y, expected_values.y.at(7), 1);
+    EXPECT_NEAR(result7.u, expected_values.u.at(7), 1);
+    EXPECT_NEAR(result7.v, expected_values.v.at(7), 1);
+  }
+}
+#endif
+
 TEST_F(GainMapMathTest, TransformYuv420) {
   jpegr_uncompressed_struct input = Yuv420Image();
   const size_t buf_size = input.width * input.height * 3 / 2;