diff options
author | Richard Henderson <richard.henderson@linaro.org> | 2020-12-29 16:16:26 -0800 |
---|---|---|
committer | Szabolcs Nagy <szabolcs.nagy@arm.com> | 2021-01-04 12:21:30 +0000 |
commit | 6695ad5da06a6fbdefffda647f192a784a275315 (patch) | |
tree | ed63f188adb5e06f60905496db3f29921bfb5b07 | |
parent | bf7f25b1758fc9c6bc42a20bd690a5d64b27ccf3 (diff) | |
download | arm-optimized-routines-6695ad5da06a6fbdefffda647f192a784a275315.tar.gz |
string: Reduce alignment in strncmp
There were nops before the beginning of the function to place
the main loop on a 64-byte boundary, but the addition of BTI
and of ILP32-specific instructions has thrown off that alignment.
As per review, drop 64-byte alignment entirely, and use the
default 16-byte alignment from ENTRY.
-rw-r--r-- | string/aarch64/strncmp-mte.S | 12 | ||||
-rw-r--r-- | string/aarch64/strncmp.S | 12 |
2 files changed, 8 insertions, 16 deletions
diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S index 42f1a3b..c6dbe0b 100644 --- a/string/aarch64/strncmp-mte.S +++ b/string/aarch64/strncmp-mte.S @@ -54,11 +54,7 @@ #endif .text - .p2align 6 - .rep 9 - nop /* Pad so that the loop below fits a cache line. */ - .endr -ENTRY_ALIGN (__strncmp_aarch64_mte, 0) +ENTRY (__strncmp_aarch64_mte) PTR_ARG (0) PTR_ARG (1) SIZE_ARG (2) @@ -73,7 +69,7 @@ ENTRY_ALIGN (__strncmp_aarch64_mte, 0) /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and can be done in parallel across the entire word. */ - /* Start of performance-critical section -- one 64B cache line. */ + .p2align 4 L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 @@ -86,7 +82,7 @@ L(start_realigned): bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp endloop, #0, #0, eq b.eq L(loop_aligned) - /* End of performance-critical section -- one 64B cache line. */ + /* End of main loop */ L(full_check): #ifndef __AARCH64EB__ @@ -178,7 +174,7 @@ L(mutual_align): orr data2, data2, tmp2 b L(start_realigned) - .p2align 6 + .p2align 4 /* Don't bother with dwords for up to 16 bytes. */ L(misaligned8): cmp limit, #16 diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S index b8824b8..52f2396 100644 --- a/string/aarch64/strncmp.S +++ b/string/aarch64/strncmp.S @@ -41,11 +41,7 @@ #define count mask .text - .p2align 6 - .rep 6 - nop /* Pad so that the loop below fits a cache line. */ - .endr -ENTRY_ALIGN (__strncmp_aarch64, 0) +ENTRY (__strncmp_aarch64) PTR_ARG (0) PTR_ARG (1) SIZE_ARG (2) @@ -63,7 +59,7 @@ ENTRY_ALIGN (__strncmp_aarch64, 0) /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and can be done in parallel across the entire word. */ - /* Start of performance-critical section -- one 64B cache line. 
*/ + .p2align 4 L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 @@ -76,7 +72,7 @@ L(start_realigned): bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp endloop, #0, #0, eq b.eq L(loop_aligned) - /* End of performance-critical section -- one 64B cache line. */ + /* End of main loop */ /* Not reached the limit, must have found the end or a diff. */ tbz limit_wd, #63, L(not_limit) @@ -181,7 +177,7 @@ L(mutual_align): add limit_wd, limit_wd, tmp3, lsr #3 b L(start_realigned) - .p2align 6 + .p2align 4 /* Don't bother with dwords for up to 16 bytes. */ L(misaligned8): cmp limit, #16 |