diff options
author | George Steed <george.steed@arm.com> | 2024-04-16 10:13:39 +0100 |
---|---|---|
committer | Frank Barchard <fbarchard@chromium.org> | 2024-04-18 19:02:43 +0000 |
commit | 4838e7a194592c026e18cbe9f83a08e07a0ce95b (patch) | |
tree | 621350012f3c7e3b2e6fb9d7aea578d90e757ac1 | |
parent | 90070986aeac1129aa7632d986d636d3d29d5859 (diff) | |
download | libyuv-4838e7a194592c026e18cbe9f83a08e07a0ce95b.tar.gz |
[AArch64] Load full vectors in ARGB{Add,Subtract}Row
Using full vectors for Add and Subtract is a win across the board. Using
full vectors for the multiply is less obviously a win, especially for
smaller cores like Cortex-A53 or Cortex-A57, so is not considered for
this change.
Observed changes in performance with this change compared to the
existing Neon code:
|             | ARGBAddRow_NEON | ARGBSubtractRow_NEON |
|-------------|-----------------|----------------------|
| Cortex-A55  | -5.1%           | -5.1%                |
| Cortex-A510 | -18.4%          | -18.4%               |
| Cortex-A76  | -28.9%          | -28.7%               |
| Cortex-A720 | -36.1%          | -36.2%               |
| Cortex-X1   | -14.2%          | -14.4%               |
| Cortex-X2   | -12.5%          | -12.5%               |
Bug: libyuv:976
Change-Id: I85316d4399c93b53baa62d0d43b2fa453517f5b4
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5457433
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
-rw-r--r-- | source/row_neon64.cc | 24 |
1 file changed, 10 insertions, 14 deletions
diff --git a/source/row_neon64.cc b/source/row_neon64.cc index f9b34a49..dd0e7b77 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -3819,16 +3819,14 @@ void ARGBAddRow_NEON(const uint8_t* src_argb, asm volatile( // 8 pixel loop. "1: \n" - "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "ld1 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "ldp q0, q1, [%0], #32 \n" // load 8 ARGB + "ldp q4, q5, [%1], #32 \n" // load 8 more "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v0.8b, v0.8b, v4.8b \n" "prfm pldl1keep, [%0, 448] \n" - "uqadd v1.8b, v1.8b, v5.8b \n" "prfm pldl1keep, [%1, 448] \n" - "uqadd v2.8b, v2.8b, v6.8b \n" - "uqadd v3.8b, v3.8b, v7.8b \n" - "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "uqadd v0.16b, v0.16b, v4.16b \n" + "uqadd v1.16b, v1.16b, v5.16b \n" + "stp q0, q1, [%2], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 @@ -3846,16 +3844,14 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb, asm volatile( // 8 pixel loop. "1: \n" - "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "ld1 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "ldp q0, q1, [%0], #32 \n" // load 8 ARGB + "ldp q4, q5, [%1], #32 \n" // load 8 more "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqsub v0.8b, v0.8b, v4.8b \n" "prfm pldl1keep, [%0, 448] \n" - "uqsub v1.8b, v1.8b, v5.8b \n" "prfm pldl1keep, [%1, 448] \n" - "uqsub v2.8b, v2.8b, v6.8b \n" - "uqsub v3.8b, v3.8b, v7.8b \n" - "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "uqsub v0.16b, v0.16b, v4.16b \n" + "uqsub v1.16b, v1.16b, v5.16b \n" + "stp q0, q1, [%2], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 |