diff options
author | Richard Henderson <richard.henderson@linaro.org> | 2020-12-29 16:16:26 -0800 |
---|---|---|
committer | Szabolcs Nagy <szabolcs.nagy@arm.com> | 2021-01-04 12:21:30 +0000 |
commit | 6695ad5da06a6fbdefffda647f192a784a275315 (patch) | |
tree | ed63f188adb5e06f60905496db3f29921bfb5b07 | |
parent | bf7f25b1758fc9c6bc42a20bd690a5d64b27ccf3 (diff) | |
download | arm-optimized-routines-6695ad5da06a6fbdefffda647f192a784a275315.tar.gz |
string: Reduce alignment in strncmp
There were nops before the beginning of the function to place
the main loop on a 64-byte boundary, but the addition of BTI
and of ILP32-specific instructions has thrown off that alignment.
As per review, drop 64-byte alignment entirely, and use the
default 16-byte alignment from ENTRY.
-rw-r--r-- | string/aarch64/strncmp-mte.S | 12 | ||||
-rw-r--r-- | string/aarch64/strncmp.S | 12 |
2 files changed, 8 insertions, 16 deletions
diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S index 42f1a3b..c6dbe0b 100644 --- a/string/aarch64/strncmp-mte.S +++ b/string/aarch64/strncmp-mte.S @@ -54,11 +54,7 @@ #endif .text - .p2align 6 - .rep 9 - nop /* Pad so that the loop below fits a cache line. */ - .endr -ENTRY_ALIGN (__strncmp_aarch64_mte, 0) +ENTRY (__strncmp_aarch64_mte) PTR_ARG (0) PTR_ARG (1) SIZE_ARG (2) @@ -73,7 +69,7 @@ ENTRY_ALIGN (__strncmp_aarch64_mte, 0) /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and can be done in parallel across the entire word. */ - /* Start of performance-critical section -- one 64B cache line. */ + .p2align 4 L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 @@ -86,7 +82,7 @@ L(start_realigned): bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp endloop, #0, #0, eq b.eq L(loop_aligned) - /* End of performance-critical section -- one 64B cache line. */ + /* End of main loop */ L(full_check): #ifndef __AARCH64EB__ @@ -178,7 +174,7 @@ L(mutual_align): orr data2, data2, tmp2 b L(start_realigned) - .p2align 6 + .p2align 4 /* Don't bother with dwords for up to 16 bytes. */ L(misaligned8): cmp limit, #16 diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S index b8824b8..52f2396 100644 --- a/string/aarch64/strncmp.S +++ b/string/aarch64/strncmp.S @@ -41,11 +41,7 @@ #define count mask .text - .p2align 6 - .rep 6 - nop /* Pad so that the loop below fits a cache line. */ - .endr -ENTRY_ALIGN (__strncmp_aarch64, 0) +ENTRY (__strncmp_aarch64) PTR_ARG (0) PTR_ARG (1) SIZE_ARG (2) @@ -63,7 +59,7 @@ ENTRY_ALIGN (__strncmp_aarch64, 0) /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and can be done in parallel across the entire word. */ - /* Start of performance-critical section -- one 64B cache line. 
*/ + .p2align 4 L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 @@ -76,7 +72,7 @@ L(start_realigned): bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp endloop, #0, #0, eq b.eq L(loop_aligned) - /* End of performance-critical section -- one 64B cache line. */ + /* End of main loop */ /* Not reached the limit, must have found the end or a diff. */ tbz limit_wd, #63, L(not_limit) @@ -181,7 +177,7 @@ L(mutual_align): add limit_wd, limit_wd, tmp3, lsr #3 b L(start_realigned) - .p2align 6 + .p2align 4 /* Don't bother with dwords for up to 16 bytes. */ L(misaligned8): cmp limit, #16 |