From 4c175c8b8e124961beb22b8f034fe16e8d3798d0 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra
Date: Wed, 12 Feb 2020 12:22:37 +0000
Subject: string: optimize memcpy

Further optimize integer memcpy. Small cases now include copies up to
32 bytes. 64-128 byte copies are split into two cases to improve
performance of 64-96 byte copies. Comments have been rewritten.

Improves glibc's memcpy-random benchmark by ~10% on Neoverse N1.
---
 string/aarch64/memcpy.S | 148 +++++++++++++++++++++---------------------------
 1 file changed, 66 insertions(+), 82 deletions(-)

diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S
index 1aad88e..5e6bf22 100644
--- a/string/aarch64/memcpy.S
+++ b/string/aarch64/memcpy.S
@@ -1,7 +1,7 @@
 /*
  * memcpy - copy memory area
  *
- * Copyright (c) 2012-2019, Arm Limited.
+ * Copyright (c) 2012-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -22,11 +22,11 @@
 #define A_l	x6
 #define A_lw	w6
 #define A_h	x7
-#define A_hw	w7
 #define B_l	x8
 #define B_lw	w8
 #define B_h	x9
 #define C_l	x10
+#define C_lw	w10
 #define C_h	x11
 #define D_l	x12
 #define D_h	x13
@@ -40,119 +40,114 @@
 #define H_h	srcend
 #define tmp1	x14
 
-/* This implementation of memcpy correctly handles overlaps, therefore
-   __memmove_aarch64 aliases to __memcpy_aarch64. By moving the src and
-   dst buffer overlap check from the start of memmove code to the
-   beginning of large copy code, the overhead of combining memcpy
-   and memmove implementations is negligible.
+/* This implementation handles overlaps and supports both memcpy and memmove
+   from a single entry point.  It uses unaligned accesses and branchless
+   sequences to keep the code small, simple and improve performance.
 
-   Copies are split into 3 main cases: small copies of up to 16 bytes,
-   medium copies of 17..128 bytes which are fully unrolled, and large
-   copies (moves).
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check is negligible since it is only required for large copies.
 
-   Large forward moves align the destination and use an unrolled loop
-   processing 64 bytes per iteration.
-
-   Large backward moves align dstend and use an unrolled loop processing
-   64 bytes per iteration.
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The destination pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
 */
 
 ENTRY (__memcpy_aarch64)
 ENTRY_ALIAS (__memmove_aarch64)
 	add	srcend, src, count
 	add	dstend, dstin, count
-	cmp	count, 16
-	b.ls	L(copy16)
-	cmp	count, 128
-	b.hi	L(move_long)
+	cmp	count, 128
+	b.hi	L(copy_long)
+	cmp	count, 32
+	b.hi	L(copy32_128)
 
-	/* Medium copies: 17..128 bytes.  */
+	/* Small copies: 0..32 bytes.  */
+	cmp	count, 16
+	b.lo	L(copy16)
 	ldp	A_l, A_h, [src]
 	ldp	D_l, D_h, [srcend, -16]
-	cmp	count, 32
-	b.hi	L(copy33_128)
 	stp	A_l, A_h, [dstin]
 	stp	D_l, D_h, [dstend, -16]
 	ret
 
-	.p2align 4
-	/* Small copies: 0..16 bytes.  */
+	/* Copy 8-15 bytes.  */
 L(copy16):
-	/* 8-15 bytes.  */
-	cmp	count, 8
-	b.lo	1f
+	tbz	count, 3, L(copy8)
 	ldr	A_l, [src]
 	ldr	A_h, [srcend, -8]
 	str	A_l, [dstin]
 	str	A_h, [dstend, -8]
 	ret
 
-	.p2align 4
-1:
-	/* 4-7 bytes.  */
-	tbz	count, 2, 1f
+	.p2align 3
+	/* Copy 4-7 bytes.  */
+L(copy8):
+	tbz	count, 2, L(copy4)
 	ldr	A_lw, [src]
-	ldr	A_hw, [srcend, -4]
+	ldr	B_lw, [srcend, -4]
 	str	A_lw, [dstin]
-	str	A_hw, [dstend, -4]
+	str	B_lw, [dstend, -4]
 	ret
 
-	.p2align 4
-	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
-	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
-1:
-	cbz	count, 2f
+	/* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+	cbz	count, L(copy0)
 	lsr	tmp1, count, 1
 	ldrb	A_lw, [src]
-	ldrb	A_hw, [srcend, -1]
+	ldrb	C_lw, [srcend, -1]
 	ldrb	B_lw, [src, tmp1]
 	strb	A_lw, [dstin]
 	strb	B_lw, [dstin, tmp1]
-	strb	A_hw, [dstend, -1]
-2:	ret
+	strb	C_lw, [dstend, -1]
+L(copy0):
+	ret
 
 	.p2align 4
-	/* Copy 33..128 bytes.  */
-L(copy33_128):
-	ldp	B_l, B_h, [src, 16]
-	ldp	C_l, C_h, [srcend, -32]
-	cmp	count, 64
-	b.hi	L(copy65_128)
-	stp	A_l, A_h, [dstin]
-	stp	D_l, D_h, [dstend, -16]
-	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstend, -32]
+	/* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+	ldp	A_l, A_h, [src]
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	ldp	D_l, D_h, [srcend, -16]
+	cmp	count, 64
+	b.hi	L(copy128)
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+	stp	D_l, D_h, [dstend, -16]
 	ret
 
 	.p2align 4
 	/* Copy 65..128 bytes.  */
-L(copy65_128):
+L(copy128):
 	ldp	E_l, E_h, [src, 32]
 	ldp	F_l, F_h, [src, 48]
+	cmp	count, 96
+	b.ls	L(copy96)
 	ldp	G_l, G_h, [srcend, -64]
 	ldp	H_l, H_h, [srcend, -48]
+	stp	G_l, G_h, [dstend, -64]
+	stp	H_l, H_h, [dstend, -48]
+L(copy96):
 	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	E_l, E_h, [dstin, 32]
+	stp	F_l, F_h, [dstin, 48]
+	stp	C_l, C_h, [dstend, -32]
 	stp	D_l, D_h, [dstend, -16]
-	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstend, -32]
-	stp	E_l, E_h, [dstin, 32]
-	stp	F_l, F_h, [dstin, 48]
-	stp	G_l, G_h, [dstend, -64]
-	stp	H_l, H_h, [dstend, -48]
 	ret
 
 	.p2align 4
-	/* Move more than 128 bytes.  */
-L(move_long):
-	sub	tmp1, dstin, src	/* Overlap check.  */
+	/* Copy more than 128 bytes.  */
+L(copy_long):
+	/* Use backwards copy if there is an overlap.  */
+	sub	tmp1, dstin, src
 	cbz	tmp1, L(copy0)
 	cmp	tmp1, count
-	b.lo	L(move_long_backwards)
+	b.lo	L(copy_long_backwards)
 
-	/* Align dst to 16 byte alignment so that we don't cross cache line
-	   boundaries on both loads and stores.  There are at least 128 bytes
-	   to copy, so copy 16 bytes unaligned and then align.  The loop
-	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
+	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
 
 	ldp	D_l, D_h, [src]
 	and	tmp1, dstin, 15
@@ -179,9 +174,7 @@ L(loop64):
 	subs	count, count, 64
 	b.hi	L(loop64)
 
-	/* Write the last full set of 64 bytes.  The remainder is at most 64
-	   bytes, so it is safe to always copy 64 bytes from the end even if
-	   there is just 1 byte left.  */
+	/* Write the last iteration and copy 64 bytes from the end.  */
 L(copy64_from_end):
 	ldp	E_l, E_h, [srcend, -64]
 	stp	A_l, A_h, [dst, 16]
@@ -195,20 +188,13 @@ L(copy64_from_end):
 	stp	A_l, A_h, [dstend, -48]
 	stp	B_l, B_h, [dstend, -32]
 	stp	C_l, C_h, [dstend, -16]
-
-L(copy0):
 	ret
 
 	.p2align 4
 
-	/* Move more than 128 bytes where src and dst buffers overlap
-	   and dst > src.
-
-	   Align dstend to 16 byte alignment so that we don't cross cache line
-	   boundaries on both loads and stores.  There are at least 128 bytes
-	   to copy, so copy 16 bytes unaligned and then align.  The loop
-	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
-L(move_long_backwards):
+	/* Large backwards copy for overlapping copies.
+	   Copy 16 bytes and then align dst to 16-byte alignment.  */
+L(copy_long_backwards):
 	ldp	D_l, D_h, [srcend, -16]
 	and	tmp1, dstend, 15
 	sub	srcend, srcend, tmp1
@@ -234,9 +220,7 @@ L(loop64_backwards):
 	subs	count, count, 64
 	b.hi	L(loop64_backwards)
 
-	/* Write the last full set of 64 bytes.  The remainder is at most 64
-	   bytes, so it is safe to always copy 64 bytes from the start even if
-	   there is just 1 byte left.  */
+	/* Write the last iteration and copy 64 bytes from the start.  */
 L(copy64_from_start):
 	ldp	G_l, G_h, [src, 48]
 	stp	A_l, A_h, [dstend, -16]
--
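As a rough C model of the small-copy strategy described in the rewritten comments
(fixed-size accesses issued from both ends of the buffer, overlapping when the
length is not a power of two, plus the branchless 0..3-byte case that may store
the same byte more than once), one might write something like the sketch below.
The function name is invented and memcpy with a constant size stands in for the
unaligned LDR/STR and LDP/STP pairs; this illustrates the technique, not the
actual assembly.

#include <stddef.h>
#include <string.h>

/* Sketch only: copy 0..32 bytes with at most two fixed-size, possibly
   overlapping accesses, mirroring the structure of the assembly.  */
static void
small_copy_sketch (char *dst, const char *src, size_t count)
{
  if (count >= 16)
    {
      /* 16..32 bytes: one 16-byte block from each end of the buffer.  */
      memcpy (dst, src, 16);
      memcpy (dst + count - 16, src + count - 16, 16);
    }
  else if (count >= 8)
    {
      /* 8..15 bytes: two overlapping 8-byte accesses.  */
      memcpy (dst, src, 8);
      memcpy (dst + count - 8, src + count - 8, 8);
    }
  else if (count >= 4)
    {
      /* 4..7 bytes: two overlapping 4-byte accesses.  */
      memcpy (dst, src, 4);
      memcpy (dst + count - 4, src + count - 4, 4);
    }
  else if (count > 0)
    {
      /* 1..3 bytes, branchless: byte 0, the middle byte and the last byte.
         The same byte is stored more than once when count is 1 or 2,
         exactly as in the assembly.  */
      dst[0] = src[0];
      dst[count >> 1] = src[count >> 1];
      dst[count - 1] = src[count - 1];
    }
}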
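The 33..128-byte path works the same way with larger blocks; the point of the
96-byte split mentioned in the commit message is that 65..96-byte copies skip
the two 16-byte blocks that are only needed for 97..128 bytes. Again an
illustrative sketch with an invented name, not the real code:

#include <stddef.h>
#include <string.h>

/* Sketch only: medium copies, count assumed to be 33..128.  */
static void
medium_copy_sketch (char *dst, const char *src, size_t count)
{
  memcpy (dst, src, 32);                                 /* A/B: first 32 bytes  */
  memcpy (dst + count - 32, src + count - 32, 32);       /* C/D: last 32 bytes   */
  if (count > 64)
    {
      memcpy (dst + 32, src + 32, 32);                   /* E/F: bytes 32..63    */
      if (count > 96)
        memcpy (dst + count - 64, src + count - 64, 32); /* G/H: 97..128 only    */
    }
}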
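For the large-copy path, the overlap check and the tail handling can be sketched
as follows. The unsigned comparison corresponds to the assembly's
"sub tmp1, dstin, src" followed by "cmp tmp1, count": only when dst lies inside
[src, src + count) is the backward loop needed. The tail is handled by
unconditionally copying the last 64 bytes, so the loop never needs a
byte-granular remainder. The function name is invented, count is assumed to be
greater than 128 as in L(copy_long), and the backward loop, the 16-byte
alignment of dst and the software pipelining of the real code are omitted.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Sketch only: forward large copy, count > 128, buffers assumed not to
   overlap harmfully (the real code branches to a backward loop then).  */
static void
large_copy_forward_sketch (char *dst, const char *src, size_t count)
{
  /* Same test as the assembly: dst - src treated as an unsigned value.  */
  if ((uintptr_t) (dst - src) < count)
    return;  /* dst inside the source: the backward path would run instead */

  size_t i = 0;
  while (count - i > 64)
    {
      memcpy (dst + i, src + i, 64);   /* 64 bytes per iteration */
      i += 64;
    }
  /* Always copy 64 bytes from the end: safe because count > 128, even if
     only a single byte of remainder is left.  */
  memcpy (dst + count - 64, src + count - 64, 64);
}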