aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: George Steed <george.steed@arm.com> 2024-03-12 17:05:08 +0000
committer: libyuv LUCI CQ <libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com> 2024-03-13 18:35:31 +0000
commit: 188e4e3afbfe90f6f13f59db6c929762de07921b (patch)
tree: d719993a8eec9279ca5cb130cc6debf79cdb8241
parent: 772bddaed79860d05f5d3b5ade09ec251b8cbbdb (diff)
download: libyuv-188e4e3afbfe90f6f13f59db6c929762de07921b.tar.gz
[AArch64] Avoid unnecessary lane-indexed loads in READYUV
The existing code makes use of a pair of lane-indexed load instructions to fill the two halves of the input vector; however, this has the effect of introducing an unnecessary dependency on the value of the vector from the previous loop iteration.

This doesn't really seem to affect little core performance since these cores never execute enough work concurrently to hit the bottleneck; however, we can improve performance on mid and big cores quite a bit by using LDR instead of LD1 to load the low lane, zeroing the upper portion of the vector rather than keeping the previous value.

Reduction in runtime for select kernels (no observed performance delta on Cortex-A55):

Kernel                  | Cortex-A76 | Cortex-X2
I422ToARGB4444Row_NEON  |     -23.1% |    -49.3%
I422ToARGBRow_NEON      |      -1.2% |     -2.5%
I422ToRGB24Row_NEON     |     -11.7% |     -7.0%
I422ToRGBARow_NEON      |      -4.7% |     -3.4%
I444AlphaToARGBRow_NEON |      -1.1% |     -2.4%
I444ToARGBRow_NEON      |      -1.6% |     -3.2%
I444ToRGB24Row_NEON     |      -9.6% |     -6.8%

Bug: libyuv:976
Change-Id: I8c9413e0e6ed97b8f060ce42b6e8abdfb77914b9
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5365868
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
-rw-r--r-- source/row_neon64.cc 4
1 files changed, 2 insertions, 2 deletions
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 1679f87c..a51b51aa 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -28,7 +28,7 @@ extern "C" {
// Read 8 Y, 4 U and 4 V from 422
#define READYUV422 \
"ldr d0, [%[src_y]], #8 \n" \
- "ld1 {v1.s}[0], [%[src_u]], #4 \n" \
+ "ldr s1, [%[src_u]], #4 \n" \
"ld1 {v1.s}[1], [%[src_v]], #4 \n" \
"zip1 v0.16b, v0.16b, v0.16b \n" \
"prfm pldl1keep, [%[src_y], 448] \n" \
@@ -39,7 +39,7 @@ extern "C" {
// Read 8 Y, 8 U and 8 V from 444
#define READYUV444 \
"ldr d0, [%[src_y]], #8 \n" \
- "ld1 {v1.d}[0], [%[src_u]], #8 \n" \
+ "ldr d1, [%[src_u]], #8 \n" \
"prfm pldl1keep, [%[src_y], 448] \n" \
"ld1 {v1.d}[1], [%[src_v]], #8 \n" \
"prfm pldl1keep, [%[src_u], 448] \n" \