author     Wilco Dijkstra <wdijkstr@arm.com>      2020-02-12 12:22:37 +0000
committer  Szabolcs Nagy <szabolcs.nagy@arm.com>  2020-02-12 12:26:23 +0000
commit     4c175c8b8e124961beb22b8f034fe16e8d3798d0
tree       12c49ca596f36558ac4104f8f551bf2c305b66ff  /string/aarch64/memcpy.S
parent     33ba19089a261964e1e84ba4edf90263b468c161
string: optimize memcpy
Further optimize integer memcpy. Small cases now include copies up to 32 bytes. 64-128 byte copies are split into two cases to improve performance of 64-96 byte copies. Comments have been rewritten. Improves glibc's memcpy-random benchmark by ~10% on Neoverse N1.
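The size split described in the commit message can be illustrated with a rough
C sketch. This is not the assembly implementation in the diff below:
memcpy_sketch is a hypothetical name, fixed-size memcpy() calls stand in for
the unrolled ldp/stp pairs, and non-overlapping buffers are assumed (memcpy's
contract).

/*
 * Rough sketch of the case split: small copies up to 32 bytes, medium copies
 * up to 128 bytes with an extra split at 96 bytes, and large copies.
 */
#include <stddef.h>
#include <string.h>

void *memcpy_sketch (void *restrict dstv, const void *restrict srcv, size_t n)
{
    unsigned char *d = dstv, *dend = d + n;
    const unsigned char *s = srcv;
    const unsigned char *send = s + n;

    if (n <= 32) {                         /* small: 0..32 bytes */
        if (n >= 16) {                     /* two possibly overlapping 16-byte copies */
            memcpy (d, s, 16);
            memcpy (dend - 16, send - 16, 16);
        } else if (n >= 8) {               /* 8..15 bytes */
            memcpy (d, s, 8);
            memcpy (dend - 8, send - 8, 8);
        } else if (n >= 4) {               /* 4..7 bytes */
            memcpy (d, s, 4);
            memcpy (dend - 4, send - 4, 4);
        } else if (n) {                    /* 1..3 bytes, branchless in the asm */
            d[0] = s[0];
            d[n >> 1] = s[n >> 1];
            dend[-1] = send[-1];
        }
    } else if (n <= 128) {                 /* medium: 33..128 bytes */
        memcpy (d, s, 32);                 /* first 32 bytes */
        memcpy (dend - 32, send - 32, 32); /* last 32 bytes */
        if (n > 64) {                      /* 65..128 bytes */
            memcpy (d + 32, s + 32, 32);   /* bytes 32..63 */
            if (n > 96)                    /* 97..128 bytes */
                memcpy (dend - 64, send - 64, 32);
        }
    } else {
        /* Large copy: the real code checks for overlap, aligns dst to
           16 bytes and runs a 64-byte-per-iteration loop (see the diff). */
        memcpy (d, s, n);
    }
    return dstv;
}

The 64/96 split in the medium path of this sketch corresponds to the new
L(copy96)/L(copy128) labels in the diff.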
Diffstat (limited to 'string/aarch64/memcpy.S')
-rw-r--r--  string/aarch64/memcpy.S  148
1 file changed, 66 insertions, 82 deletions
diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S
index 1aad88e..5e6bf22 100644
--- a/string/aarch64/memcpy.S
+++ b/string/aarch64/memcpy.S
@@ -1,7 +1,7 @@
/*
* memcpy - copy memory area
*
- * Copyright (c) 2012-2019, Arm Limited.
+ * Copyright (c) 2012-2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
@@ -22,11 +22,11 @@
#define A_l x6
#define A_lw w6
#define A_h x7
-#define A_hw w7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_l x10
+#define C_lw w10
#define C_h x11
#define D_l x12
#define D_h x13
@@ -40,119 +40,114 @@
#define H_h srcend
#define tmp1 x14
-/* This implementation of memcpy correctly handles overlaps, therefore
- __memmove_aarch64 aliases to __memcpy_aarch64. By moving the src and
- dst buffer overlap check from the start of memmove code to the
- beginning of large copy code, the overhead of combining memcpy
- and memmove implementations is negligible.
+/* This implementation handles overlaps and supports both memcpy and memmove
+ from a single entry point. It uses unaligned accesses and branchless
+ sequences to keep the code small, simple and improve performance.
- Copies are split into 3 main cases: small copies of up to 16 bytes,
- medium copies of 17..128 bytes which are fully unrolled, and large
- copies (moves).
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check is negligible since it is only required for large copies.
- Large forward moves align the destination and use an unrolled loop
- processing 64 bytes per iteration.
-
- Large backward moves align dstend and use an unrolled loop processing
- 64 bytes per iteration.
+ Large copies use a software pipelined loop processing 64 bytes per iteration.
+ The destination pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
*/
ENTRY (__memcpy_aarch64)
ENTRY_ALIAS (__memmove_aarch64)
add srcend, src, count
add dstend, dstin, count
- cmp count, 16
- b.ls L(copy16)
- cmp count, 128
- b.hi L(move_long)
+ cmp count, 128
+ b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
- /* Medium copies: 17..128 bytes. */
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
ldp A_l, A_h, [src]
ldp D_l, D_h, [srcend, -16]
- cmp count, 32
- b.hi L(copy33_128)
stp A_l, A_h, [dstin]
stp D_l, D_h, [dstend, -16]
ret
- .p2align 4
- /* Small copies: 0..16 bytes. */
+ /* Copy 8-15 bytes. */
L(copy16):
- /* 8-15 bytes. */
- cmp count, 8
- b.lo 1f
+ tbz count, 3, L(copy8)
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
- .p2align 4
-1:
- /* 4-7 bytes. */
- tbz count, 2, 1f
+ .p2align 3
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
ldr A_lw, [src]
- ldr A_hw, [srcend, -4]
+ ldr B_lw, [srcend, -4]
str A_lw, [dstin]
- str A_hw, [dstend, -4]
+ str B_lw, [dstend, -4]
ret
- .p2align 4
- /* Copy 0..3 bytes. Use a branchless sequence that copies the same
- byte 3 times if count==1, or the 2nd byte twice if count==2. */
-1:
- cbz count, 2f
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
lsr tmp1, count, 1
ldrb A_lw, [src]
- ldrb A_hw, [srcend, -1]
+ ldrb C_lw, [srcend, -1]
ldrb B_lw, [src, tmp1]
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
- strb A_hw, [dstend, -1]
-2: ret
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
.p2align 4
- /* Copy 33..128 bytes. */
-L(copy33_128):
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [srcend, -32]
- cmp count, 64
- b.hi L(copy65_128)
- stp A_l, A_h, [dstin]
- stp D_l, D_h, [dstend, -16]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_l, A_h, [src]
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ ldp D_l, D_h, [srcend, -16]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
ret
.p2align 4
/* Copy 65..128 bytes. */
-L(copy65_128):
+L(copy128):
ldp E_l, E_h, [src, 32]
ldp F_l, F_h, [src, 48]
+ cmp count, 96
+ b.ls L(copy96)
ldp G_l, G_h, [srcend, -64]
ldp H_l, H_h, [srcend, -48]
+ stp G_l, G_h, [dstend, -64]
+ stp H_l, H_h, [dstend, -48]
+L(copy96):
stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp E_l, E_h, [dstin, 32]
+ stp F_l, F_h, [dstin, 48]
+ stp C_l, C_h, [dstend, -32]
stp D_l, D_h, [dstend, -16]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
- stp E_l, E_h, [dstin, 32]
- stp F_l, F_h, [dstin, 48]
- stp G_l, G_h, [dstend, -64]
- stp H_l, H_h, [dstend, -48]
ret
.p2align 4
- /* Move more than 128 bytes. */
-L(move_long):
- sub tmp1, dstin, src /* Overlap check. */
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
cbz tmp1, L(copy0)
cmp tmp1, count
- b.lo L(move_long_backwards)
+ b.lo L(copy_long_backwards)
- /* Align dst to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 128 bytes
- to copy, so copy 16 bytes unaligned and then align. The loop
- copies 64 bytes per iteration and prefetches one iteration ahead. */
+ /* Copy 16 bytes and then align dst to 16-byte alignment. */
ldp D_l, D_h, [src]
and tmp1, dstin, 15
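The overlap test at L(copy_long) above reduces the memmove direction choice to
one unsigned subtraction and compare. A minimal C sketch of the same idea
follows; copy_direction is a hypothetical name, and plain byte loops stand in
for the 64-byte unrolled loops of the assembly.

/*
 * tmp1 = dstin - src, treated as unsigned.  Zero means nothing to do; a value
 * below count means dst overlaps src from above, so copy backwards; anything
 * else is safe to copy forwards (this also covers dst < src, where the
 * subtraction wraps to a huge value).
 */
#include <stddef.h>
#include <stdint.h>

void *copy_direction (void *dstv, const void *srcv, size_t count)
{
    unsigned char *d = dstv;
    const unsigned char *s = srcv;
    uintptr_t tmp1 = (uintptr_t) d - (uintptr_t) s;

    if (tmp1 == 0)                         /* cbz tmp1, L(copy0) */
        return dstv;
    if (tmp1 < count) {                    /* b.lo L(copy_long_backwards) */
        while (count--)                    /* backward copy, last byte first */
            d[count] = s[count];
    } else {
        for (size_t i = 0; i < count; i++) /* forward copy */
            d[i] = s[i];
    }
    return dstv;
}

As the rewritten comment block notes, this test only runs for copies above 128
bytes; the small and medium paths load all data before storing it, so they
handle overlaps without any check.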
@@ -179,9 +174,7 @@ L(loop64):
subs count, count, 64
b.hi L(loop64)
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the end even if
- there is just 1 byte left. */
+ /* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
ldp E_l, E_h, [srcend, -64]
stp A_l, A_h, [dst, 16]
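The tail comment above ("copy 64 bytes from the end") avoids any remainder
handling: since this path is only reached for copies of more than 128 bytes,
one unconditional 64-byte copy from the end covers whatever the loop left
over. A hedged C sketch of just that idea follows; copy_large_fwd is a
hypothetical name, it assumes non-overlapping buffers and n >= 64, and it
omits the destination alignment and software pipelining of the assembly.

/*
 * Sketch of the loop tail used by L(loop64) / L(copy64_from_end).
 */
#include <stddef.h>
#include <string.h>

void copy_large_fwd (unsigned char *d, const unsigned char *s, size_t n)
{
    unsigned char *dend = d + n;
    const unsigned char *send = s + n;

    /* Main loop: 64 bytes per iteration while more than 64 bytes remain. */
    while (n > 64) {
        memcpy (d, s, 64);
        d += 64;
        s += 64;
        n -= 64;
    }
    /* Tail: the 1..64 remaining bytes all lie within the last 64 bytes of
       the buffer, so one unconditional 64-byte copy from the end finishes
       the job without branching on the exact remainder. */
    memcpy (dend - 64, send - 64, 64);
}

The backward variant at L(copy_long_backwards) uses the mirror image of this
trick, finishing with 64 bytes copied from the start (L(copy64_from_start)).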
@@ -195,20 +188,13 @@ L(copy64_from_end):
stp A_l, A_h, [dstend, -48]
stp B_l, B_h, [dstend, -32]
stp C_l, C_h, [dstend, -16]
-
-L(copy0):
ret
.p2align 4
- /* Move more than 128 bytes where src and dst buffers overlap
- and dst > src.
-
- Align dstend to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 128 bytes
- to copy, so copy 16 bytes unaligned and then align. The loop
- copies 64 bytes per iteration and prefetches one iteration ahead. */
-L(move_long_backwards):
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align dst to 16-byte alignment. */
+L(copy_long_backwards):
ldp D_l, D_h, [srcend, -16]
and tmp1, dstend, 15
sub srcend, srcend, tmp1
@@ -234,9 +220,7 @@ L(loop64_backwards):
subs count, count, 64
b.hi L(loop64_backwards)
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the start even if
- there is just 1 byte left. */
+ /* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
ldp G_l, G_h, [src, 48]
stp A_l, A_h, [dstend, -16]