diff options
author | Wilco Dijkstra <wilco.dijkstra@arm.com> | 2022-08-22 13:20:47 +0100 |
---|---|---|
committer | Szabolcs Nagy <szabolcs.nagy@arm.com> | 2022-08-23 10:09:05 +0100 |
commit | f890e426fd2607c78b39c04a795c0e486541a108 (patch) | |
tree | a37c9d2d037c7fdbae41b515734b6a005cf95a81 /string | |
parent | 5de06730073d93487fdda678db08e9e2cafe93bb (diff) | |
download | arm-optimized-routines-f890e426fd2607c78b39c04a795c0e486541a108.tar.gz |
string: Optimize strlen-mte
Optimize strlen by unrolling the main loop. Large strings are 64% faster.
Diffstat (limited to 'string')
-rw-r--r-- | string/aarch64/strlen-mte.S | 21 |
1 file changed, 12 insertions, 9 deletions
diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S index 0d33ebb..fdb07ae 100644 --- a/string/aarch64/strlen-mte.S +++ b/string/aarch64/strlen-mte.S @@ -28,13 +28,9 @@ #define dend d2 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + Process the string in 16-byte aligned chunks. Compute a 64-bit mask with + four bits per byte using the shrn instruction. A count trailing zeros then + identifies the first zero byte. */ ENTRY (__strlen_aarch64_mte) PTR_ARG (0) @@ -54,18 +50,25 @@ ENTRY (__strlen_aarch64_mte) .p2align 5 L(loop): - ldr data, [src, 16]! + ldr data, [src, 16] + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loop_end) + ldr data, [src, 32]! cmeq vhas_nul.16b, vdata.16b, 0 umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend cbz synd, L(loop) - + sub src, src, 16 +L(loop_end): shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ sub result, src, srcin fmov synd, dend #ifndef __AARCH64EB__ rbit synd, synd #endif + add result, result, 16 clz tmp, synd add result, result, tmp, lsr 2 ret |