diff options
author | Wilco Dijkstra <wilco.dijkstra@arm.com> | 2023-01-10 14:27:39 +0000 |
---|---|---|
committer | Szabolcs Nagy <szabolcs.nagy@arm.com> | 2023-01-10 14:27:39 +0000 |
commit | 7c1d7a24c9e2ea5ba846a6311d24df2eb0154ea0 (patch) | |
tree | 6b613dc3da96eb78ed2fe02c8d472197bb61b811 | |
parent | 10589b2c95e4d482f09ac6a705918fdc32c8421a (diff) | |
download | arm-optimized-routines-7c1d7a24c9e2ea5ba846a6311d24df2eb0154ea0.tar.gz |
string: Optimize strcpy
Optimize strcpy main loop - large strings are ~22% faster.
-rw-r--r-- | string/aarch64/strcpy.S | 31 |
1 file changed, 17 insertions, 14 deletions
```diff
diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S
index 9aca330..470a865 100644
--- a/string/aarch64/strcpy.S
+++ b/string/aarch64/strcpy.S
@@ -84,13 +84,10 @@ ENTRY (STRCPY)
 	IFSTPCPY (add result, dstin, len)
 	ret
 
-	.p2align 4,,8
 L(tail):
 	rbit	synd, synd
 	clz	len, synd
 	lsr	len, len, 2
-
-	.p2align 4
 L(less16):
 	tbz	len, 3, L(less8)
 	sub	tmp, len, 7
@@ -123,31 +120,37 @@ L(zerobyte):
 
 	.p2align 4
 L(start_loop):
-	sub	len, src, srcin
+	sub	tmp, srcin, dstin
 	ldr	dataq2, [srcin]
-	add	dst, dstin, len
+	sub	dst, src, tmp
 	str	dataq2, [dstin]
-
-	.p2align 5
 L(loop):
-	str	dataq, [dst], 16
-	ldr	dataq, [src, 16]!
+	str	dataq, [dst], 32
+	ldr	dataq, [src, 16]
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	synd, dend
+	cbnz	synd, L(loopend)
+	str	dataq, [dst, -16]
+	ldr	dataq, [src, 32]!
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
 	fmov	synd, dend
 	cbz	synd, L(loop)
-
+	add	dst, dst, 16
+L(loopend):
 	shrn	vend.8b, vhas_nul.8h, 4	/* 128->64 */
 	fmov	synd, dend
+	sub	dst, dst, 31
 #ifndef __AARCH64EB__
 	rbit	synd, synd
 #endif
 	clz	len, synd
 	lsr	len, len, 2
-	sub	tmp, len, 15
-	ldr	dataq, [src, tmp]
-	str	dataq, [dst, tmp]
-	IFSTPCPY (add result, dst, len)
+	add	dst, dst, len
+	ldr	dataq, [dst, tmp]
+	str	dataq, [dst]
+	IFSTPCPY (add result, dst, 15)
 	ret
 
 END (STRCPY)
```