author     Wilco Dijkstra <wilco.dijkstra@arm.com>   2022-08-22 13:20:47 +0100
committer  Szabolcs Nagy <szabolcs.nagy@arm.com>     2022-08-23 10:09:05 +0100
commit     f890e426fd2607c78b39c04a795c0e486541a108 (patch)
tree       a37c9d2d037c7fdbae41b515734b6a005cf95a81 /string
parent     5de06730073d93487fdda678db08e9e2cafe93bb (diff)
string: Optimize strlen-mte
Optimize strlen by unrolling the main loop. Large strings are 64% faster.
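
The unrolled loop shape is easiest to see in C. The sketch below is hypothetical, not the shipped code: chunk_has_nul() is a stand-in for the vector NUL test (the cmeq/umaxp/fmov sequence in the assembly), and find_nul_chunk() only mirrors the control flow of checking two 16-byte chunks per iteration with a single pointer update. It assumes every 16-byte aligned chunk up to and including the one holding the NUL is readable, which the assembly can rely on because aligned 16-byte loads never cross a page (or MTE tag granule) boundary.

    #include <stdbool.h>
    #include <string.h>

    /* Hypothetical stand-in for the vector NUL test (cmeq/umaxp/fmov).  */
    static bool chunk_has_nul (const char *p)
    {
        return memchr (p, '\0', 16) != NULL;
    }

    /* Return the 16-byte chunk containing the first NUL.  src is 16-byte
       aligned and its own chunk has already been checked, matching the
       state on entry to L(loop).  */
    static const char *find_nul_chunk (const char *src)
    {
        for (;;) {
            if (chunk_has_nul (src + 16))  /* ldr data, [src, 16]; early exit */
                return src + 16;
            src += 32;                     /* ldr data, [src, 32]! writeback  */
            if (chunk_has_nul (src))       /* fall-through does sub src, 16   */
                return src;
        }
    }

Each iteration now pays the branch and address-update overhead once per 32 bytes instead of once per 16, which is what the large-string speedup reflects.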
Diffstat (limited to 'string')
-rw-r--r--  string/aarch64/strlen-mte.S | 21
1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S
index 0d33ebb..fdb07ae 100644
--- a/string/aarch64/strlen-mte.S
+++ b/string/aarch64/strlen-mte.S
@@ -28,13 +28,9 @@
#define dend d2
/* Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+ Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
+ four bits per byte using the shrn instruction. A count trailing zeros then
+ identifies the first zero byte. */
ENTRY (__strlen_aarch64_mte)
PTR_ARG (0)
@@ -54,18 +50,25 @@ ENTRY (__strlen_aarch64_mte)
.p2align 5
L(loop):
- ldr data, [src, 16]!
+ ldr data, [src, 16]
+ cmeq vhas_nul.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbnz synd, L(loop_end)
+ ldr data, [src, 32]!
cmeq vhas_nul.16b, vdata.16b, 0
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
cbz synd, L(loop)
-
+ sub src, src, 16
+L(loop_end):
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
sub result, src, srcin
fmov synd, dend
#ifndef __AARCH64EB__
rbit synd, synd
#endif
+ add result, result, 16
clz tmp, synd
add result, result, tmp, lsr 2
ret
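
To make the new comment concrete, here is a minimal C rendering of the shrn syndrome trick using ACLE NEON intrinsics from arm_neon.h. first_nul_index() is a name introduced for illustration; __builtin_ctzll plays the role of the rbit/clz pair (little-endian only, i.e. the #ifndef __AARCH64EB__ case), and the final shift by 2 matches the add result, result, tmp, lsr 2 above.

    #include <arm_neon.h>
    #include <stdint.h>

    /* Index of the first NUL byte in the 16-byte chunk at p, or 16 if
       the chunk contains none.  Assumes the whole chunk is readable.  */
    static unsigned first_nul_index (const unsigned char *p)
    {
        uint8x16_t data = vld1q_u8 (p);
        /* cmeq: 0xff in each byte that is NUL, 0x00 elsewhere.  */
        uint8x16_t cmp = vceqzq_u8 (data);
        /* shrn #4 narrows each 16-bit lane to 8 bits, keeping the high
           nibble of the even byte and the low nibble of the odd byte, so
           the 128-bit compare result becomes a 64-bit syndrome with four
           bits per input byte.  */
        uint8x8_t narrowed = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
        uint64_t synd = vget_lane_u64 (vreinterpret_u64_u8 (narrowed), 0);
        if (synd == 0)
            return 16;
        /* Four syndrome bits per byte, so CTZ / 4 is the byte index.  */
        return (unsigned) (__builtin_ctzll (synd) >> 2);
    }

In these terms, the epilogue's arithmetic (sub result, src, srcin; add result, result, 16; add result, result, tmp, lsr 2) computes (chunk base - string start) + index within the chunk, i.e. the string length.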