author	Richard Henderson <richard.henderson@linaro.org>	2020-12-29 16:16:26 -0800
committer	Szabolcs Nagy <szabolcs.nagy@arm.com>	2021-01-04 12:21:30 +0000
commit	6695ad5da06a6fbdefffda647f192a784a275315 (patch)
tree	ed63f188adb5e06f60905496db3f29921bfb5b07
parent	bf7f25b1758fc9c6bc42a20bd690a5d64b27ccf3 (diff)
string: Reduce alignment in strncmp
There were nops before the start of the function to place the main loop on a 64-byte boundary, but the addition of BTI and ILP32 instructions has invalidated that alignment. As per review, drop the 64-byte alignment entirely and use the default 16-byte alignment from ENTRY.
 string/aarch64/strncmp-mte.S | 12 ++++--------
 string/aarch64/strncmp.S     | 12 ++++--------
 2 files changed, 8 insertions(+), 16 deletions(-)
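
The change the commit message describes can be sketched as follows. This is an illustration only, not code from the patch: the labels, nop count, and loop bodies are hypothetical stand-ins for the real ENTRY/ENTRY_ALIGN macros and the strncmp prologue. The removed scheme forced 64-byte alignment before the entry point and hand-padded with nops so the hot loop began a fresh cache line, which breaks whenever the prologue grows; the replacement keeps the default 16-byte entry alignment and aligns only the loop itself.

	.arch	armv8-a
	.text

	/* Removed scheme: force 64-byte alignment, then hand-pad with nops so
	   that the loop label (after the prologue) starts a new 64-byte cache
	   line.  The nop count must be re-tuned whenever prologue instructions
	   are added (BTI, PTR_ARG/SIZE_ARG for ILP32, ...). */
	.p2align 6
	.rep	9
	nop
	.endr
	.globl	old_fn			/* hypothetical; stands for ENTRY_ALIGN (name, 0) */
old_fn:
	cbz	x0, 2f			/* tiny prologue for illustration */
1:	subs	x0, x0, #1		/* hot loop; lands on a 64-byte boundary
					   only if the nop count matches the
					   prologue length exactly */
	b.ne	1b
2:	ret

	/* Replacement scheme: take the default 16-byte alignment for the entry
	   point and align only the loop, independent of the prologue. */
	.p2align 4
	.globl	new_fn			/* hypothetical; stands for ENTRY (name) */
new_fn:
	cbz	x0, 4f
	.p2align 4
3:	subs	x0, x0, #1		/* hot loop; always 16-byte aligned */
	b.ne	3b
4:	ret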
diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S
index 42f1a3b..c6dbe0b 100644
--- a/string/aarch64/strncmp-mte.S
+++ b/string/aarch64/strncmp-mte.S
@@ -54,11 +54,7 @@
#endif
.text
- .p2align 6
- .rep 9
- nop /* Pad so that the loop below fits a cache line. */
- .endr
-ENTRY_ALIGN (__strncmp_aarch64_mte, 0)
+ENTRY (__strncmp_aarch64_mte)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
@@ -73,7 +69,7 @@ ENTRY_ALIGN (__strncmp_aarch64_mte, 0)
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
- /* Start of performance-critical section -- one 64B cache line. */
+ .p2align 4
L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
@@ -86,7 +82,7 @@ L(start_realigned):
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp endloop, #0, #0, eq
b.eq L(loop_aligned)
- /* End of performance-critical section -- one 64B cache line. */
+ /* End of main loop */
L(full_check):
#ifndef __AARCH64EB__
@@ -178,7 +174,7 @@ L(mutual_align):
orr data2, data2, tmp2
b L(start_realigned)
- .p2align 6
+ .p2align 4
/* Don't bother with dwords for up to 16 bytes. */
L(misaligned8):
cmp limit, #16
diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S
index b8824b8..52f2396 100644
--- a/string/aarch64/strncmp.S
+++ b/string/aarch64/strncmp.S
@@ -41,11 +41,7 @@
#define count mask
.text
- .p2align 6
- .rep 6
- nop /* Pad so that the loop below fits a cache line. */
- .endr
-ENTRY_ALIGN (__strncmp_aarch64, 0)
+ENTRY (__strncmp_aarch64)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
@@ -63,7 +59,7 @@ ENTRY_ALIGN (__strncmp_aarch64, 0)
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
- /* Start of performance-critical section -- one 64B cache line. */
+ .p2align 4
L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
@@ -76,7 +72,7 @@ L(start_realigned):
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp endloop, #0, #0, eq
b.eq L(loop_aligned)
- /* End of performance-critical section -- one 64B cache line. */
+ /* End of main loop */
/* Not reached the limit, must have found the end or a diff. */
tbz limit_wd, #63, L(not_limit)
@@ -181,7 +177,7 @@ L(mutual_align):
add limit_wd, limit_wd, tmp3, lsr #3
b L(start_realigned)
- .p2align 6
+ .p2align 4
/* Don't bother with dwords for up to 16 bytes. */
L(misaligned8):
cmp limit, #16
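
The NUL-detection comment retained in both hunks describes the standard word-at-a-time zero-byte test. Below is a minimal standalone sketch of that test, assuming a hypothetical helper name that is not part of the library; the real strncmp code uses the equivalent (X - 1) & ~(X | 0x7f) form with bics/ccmp so the zero-byte check and the data1 != data2 check share a single branch at the end of the loop.

	.arch	armv8-a
	.text
	.globl	has_zero_byte		/* hypothetical helper, not in the library */
	/* has_zero_byte(x): return non-zero iff any byte of x is 0x00, using
	   (X - 0x01..01) & ~X & 0x80..80 applied to all eight bytes at once. */
has_zero_byte:
	mov	x1, #0x0101010101010101	/* repeated 0x01 per byte */
	sub	x2, x0, x1		/* sets a byte's top bit if that byte was zero
					   (or a borrow from a zero byte passed through it) */
	bic	x2, x2, x0		/* clear it where the original byte already had
					   its top bit set */
	and	x0, x2, x1, lsl #7	/* mask to 0x80..80: non-zero iff some byte was zero */
	ret

A caller would simply test the returned mask against zero; the loop in the patched code instead feeds the same kind of mask into ccmp so that one b.eq both continues the loop and falls through on either a NUL or a difference.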