aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Wilco Dijkstra <wilco.dijkstra@arm.com> 2023-01-10 14:27:39 +0000
committer Szabolcs Nagy <szabolcs.nagy@arm.com> 2023-01-10 14:27:39 +0000
commit 7c1d7a24c9e2ea5ba846a6311d24df2eb0154ea0 (patch)
tree 6b613dc3da96eb78ed2fe02c8d472197bb61b811
parent 10589b2c95e4d482f09ac6a705918fdc32c8421a (diff)
download arm-optimized-routines-7c1d7a24c9e2ea5ba846a6311d24df2eb0154ea0.tar.gz
string: Optimize strcpy
Optimize strcpy main loop - large strings are ~22% faster.
-rw-r--r-- string/aarch64/strcpy.S 31
1 files changed, 17 insertions, 14 deletions
diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S
index 9aca330..470a865 100644
--- a/string/aarch64/strcpy.S
+++ b/string/aarch64/strcpy.S
@@ -84,13 +84,10 @@ ENTRY (STRCPY)
IFSTPCPY (add result, dstin, len)
ret
- .p2align 4,,8
L(tail):
rbit synd, synd
clz len, synd
lsr len, len, 2
-
- .p2align 4
L(less16):
tbz len, 3, L(less8)
sub tmp, len, 7
@@ -123,31 +120,37 @@ L(zerobyte):
.p2align 4
L(start_loop):
- sub len, src, srcin
+ sub tmp, srcin, dstin
ldr dataq2, [srcin]
- add dst, dstin, len
+ sub dst, src, tmp
str dataq2, [dstin]
-
- .p2align 5
L(loop):
- str dataq, [dst], 16
- ldr dataq, [src, 16]!
+ str dataq, [dst], 32
+ ldr dataq, [src, 16]
+ cmeq vhas_nul.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbnz synd, L(loopend)
+ str dataq, [dst, -16]
+ ldr dataq, [src, 32]!
cmeq vhas_nul.16b, vdata.16b, 0
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
cbz synd, L(loop)
-
+ add dst, dst, 16
+L(loopend):
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
+ sub dst, dst, 31
#ifndef __AARCH64EB__
rbit synd, synd
#endif
clz len, synd
lsr len, len, 2
- sub tmp, len, 15
- ldr dataq, [src, tmp]
- str dataq, [dst, tmp]
- IFSTPCPY (add result, dst, len)
+ add dst, dst, len
+ ldr dataq, [dst, tmp]
+ str dataq, [dst]
+ IFSTPCPY (add result, dst, 15)
ret
END (STRCPY)