From 4c175c8b8e124961beb22b8f034fe16e8d3798d0 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra
Date: Wed, 12 Feb 2020 12:22:37 +0000
Subject: string: optimize memcpy

Further optimize integer memcpy. Small cases now include copies up to
32 bytes. 64-128 byte copies are split into two cases to improve
performance of 64-96 byte copies. Comments have been rewritten.

Improves glibc's memcpy-random benchmark by ~10% on Neoverse N1.
---
 string/aarch64/memcpy.S | 148 +++++++++++++++++++++---------------------------
 1 file changed, 66 insertions(+), 82 deletions(-)

diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S
index 1aad88e..5e6bf22 100644
--- a/string/aarch64/memcpy.S
+++ b/string/aarch64/memcpy.S
@@ -1,7 +1,7 @@
 /*
  * memcpy - copy memory area
  *
- * Copyright (c) 2012-2019, Arm Limited.
+ * Copyright (c) 2012-2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
@@ -22,11 +22,11 @@
 #define A_l	x6
 #define A_lw	w6
 #define A_h	x7
-#define A_hw	w7
 #define B_l	x8
 #define B_lw	w8
 #define B_h	x9
 #define C_l	x10
+#define C_lw	w10
 #define C_h	x11
 #define D_l	x12
 #define D_h	x13
@@ -40,119 +40,114 @@
 #define H_h	srcend
 #define tmp1	x14
 
-/* This implementation of memcpy correctly handles overlaps, therefore
-   __memmove_aarch64 aliases to __memcpy_aarch64. By moving the src and
-   dst buffer overlap check from the start of memmove code to the
-   beginning of large copy code, the overhead of combining memcpy
-   and memmove implementations is negligible.
+/* This implementation handles overlaps and supports both memcpy and memmove
+   from a single entry point.  It uses unaligned accesses and branchless
+   sequences to keep the code small, simple and improve performance.
 
-   Copies are split into 3 main cases: small copies of up to 16 bytes,
-   medium copies of 17..128 bytes which are fully unrolled, and large
-   copies (moves).
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check is negligible since it is only required for large copies.
 
-   Large forward moves align the destination and use an unrolled loop
-   processing 64 bytes per iteration.
-
-   Large backward moves align dstend and use an unrolled loop processing
-   64 bytes per iteration.
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The destination pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
 */
 
 ENTRY (__memcpy_aarch64)
 ENTRY_ALIAS (__memmove_aarch64)
 	add	srcend, src, count
 	add	dstend, dstin, count
-	cmp	count, 16
-	b.ls	L(copy16)
-	cmp	count, 128
-	b.hi	L(move_long)
+	cmp	count, 128
+	b.hi	L(copy_long)
+	cmp	count, 32
+	b.hi	L(copy32_128)
 
-	/* Medium copies: 17..128 bytes.  */
+	/* Small copies: 0..32 bytes.  */
+	cmp	count, 16
+	b.lo	L(copy16)
 	ldp	A_l, A_h, [src]
 	ldp	D_l, D_h, [srcend, -16]
-	cmp	count, 32
-	b.hi	L(copy33_128)
 	stp	A_l, A_h, [dstin]
 	stp	D_l, D_h, [dstend, -16]
 	ret
 
-	.p2align 4
-	/* Small copies: 0..16 bytes.  */
+	/* Copy 8-15 bytes.  */
 L(copy16):
-	/* 8-15 bytes.  */
-	cmp	count, 8
-	b.lo	1f
+	tbz	count, 3, L(copy8)
 	ldr	A_l, [src]
 	ldr	A_h, [srcend, -8]
 	str	A_l, [dstin]
 	str	A_h, [dstend, -8]
 	ret
 
-	.p2align 4
-1:
-	/* 4-7 bytes.  */
-	tbz	count, 2, 1f
+	.p2align 3
+	/* Copy 4-7 bytes.  */
+L(copy8):
+	tbz	count, 2, L(copy4)
 	ldr	A_lw, [src]
-	ldr	A_hw, [srcend, -4]
+	ldr	B_lw, [srcend, -4]
 	str	A_lw, [dstin]
-	str	A_hw, [dstend, -4]
+	str	B_lw, [dstend, -4]
 	ret
 
-	.p2align 4
-	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
-	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
-1:
-	cbz	count, 2f
+	/* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+	cbz	count, L(copy0)
 	lsr	tmp1, count, 1
 	ldrb	A_lw, [src]
-	ldrb	A_hw, [srcend, -1]
+	ldrb	C_lw, [srcend, -1]
 	ldrb	B_lw, [src, tmp1]
 	strb	A_lw, [dstin]
 	strb	B_lw, [dstin, tmp1]
-	strb	A_hw, [dstend, -1]
-2:	ret
+	strb	C_lw, [dstend, -1]
+L(copy0):
+	ret
 
 	.p2align 4
-	/* Copy 33..128 bytes.  */
-L(copy33_128):
-	ldp	B_l, B_h, [src, 16]
-	ldp	C_l, C_h, [srcend, -32]
-	cmp	count, 64
-	b.hi	L(copy65_128)
-	stp	A_l, A_h, [dstin]
-	stp	D_l, D_h, [dstend, -16]
-	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstend, -32]
+	/* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+	ldp	A_l, A_h, [src]
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	ldp	D_l, D_h, [srcend, -16]
+	cmp	count, 64
+	b.hi	L(copy128)
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+	stp	D_l, D_h, [dstend, -16]
 	ret
 
 	.p2align 4
 	/* Copy 65..128 bytes.  */
-L(copy65_128):
+L(copy128):
 	ldp	E_l, E_h, [src, 32]
 	ldp	F_l, F_h, [src, 48]
+	cmp	count, 96
+	b.ls	L(copy96)
 	ldp	G_l, G_h, [srcend, -64]
 	ldp	H_l, H_h, [srcend, -48]
+	stp	G_l, G_h, [dstend, -64]
+	stp	H_l, H_h, [dstend, -48]
+L(copy96):
 	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	E_l, E_h, [dstin, 32]
+	stp	F_l, F_h, [dstin, 48]
+	stp	C_l, C_h, [dstend, -32]
 	stp	D_l, D_h, [dstend, -16]
-	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstend, -32]
-	stp	E_l, E_h, [dstin, 32]
-	stp	F_l, F_h, [dstin, 48]
-	stp	G_l, G_h, [dstend, -64]
-	stp	H_l, H_h, [dstend, -48]
 	ret
 
 	.p2align 4
-	/* Move more than 128 bytes.  */
-L(move_long):
-	sub	tmp1, dstin, src	/* Overlap check.  */
+	/* Copy more than 128 bytes.  */
+L(copy_long):
+	/* Use backwards copy if there is an overlap.  */
+	sub	tmp1, dstin, src
 	cbz	tmp1, L(copy0)
 	cmp	tmp1, count
-	b.lo	L(move_long_backwards)
+	b.lo	L(copy_long_backwards)
 
-	/* Align dst to 16 byte alignment so that we don't cross cache line
-	   boundaries on both loads and stores.  There are at least 128 bytes
-	   to copy, so copy 16 bytes unaligned and then align.  The loop
-	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
+	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
 
 	ldp	D_l, D_h, [src]
 	and	tmp1, dstin, 15
@@ -179,9 +174,7 @@ L(loop64):
 	subs	count, count, 64
 	b.hi	L(loop64)
 
-	/* Write the last full set of 64 bytes.  The remainder is at most 64
-	   bytes, so it is safe to always copy 64 bytes from the end even if
-	   there is just 1 byte left.  */
+	/* Write the last iteration and copy 64 bytes from the end.  */
 L(copy64_from_end):
 	ldp	E_l, E_h, [srcend, -64]
 	stp	A_l, A_h, [dst, 16]
@@ -195,20 +188,13 @@ L(copy64_from_end):
 	stp	A_l, A_h, [dstend, -48]
 	stp	B_l, B_h, [dstend, -32]
 	stp	C_l, C_h, [dstend, -16]
-
-L(copy0):
 	ret
 
 	.p2align 4
 
-	/* Move more than 128 bytes where src and dst buffers overlap
-	   and dst > src.
-
-	   Align dstend to 16 byte alignment so that we don't cross cache line
-	   boundaries on both loads and stores.  There are at least 128 bytes
-	   to copy, so copy 16 bytes unaligned and then align.  The loop
-	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
-L(move_long_backwards):
+	/* Large backwards copy for overlapping copies.
+	   Copy 16 bytes and then align dst to 16-byte alignment.  */
+L(copy_long_backwards):
 	ldp	D_l, D_h, [srcend, -16]
 	and	tmp1, dstend, 15
 	sub	srcend, srcend, tmp1
@@ -234,9 +220,7 @@ L(loop64_backwards):
 	subs	count, count, 64
 	b.hi	L(loop64_backwards)
 
-	/* Write the last full set of 64 bytes.  The remainder is at most 64
-	   bytes, so it is safe to always copy 64 bytes from the start even if
-	   there is just 1 byte left.  */
+	/* Write the last iteration and copy 64 bytes from the start.  */
 L(copy64_from_start):
 	ldp	G_l, G_h, [src, 48]
 	stp	A_l, A_h, [dstend, -16]
--
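As a rough C model of the small-copy strategy described in the rewritten comments
(fixed-size accesses issued from both ends of the buffer, overlapping when the
length is not a power of two, plus the branchless 0..3-byte case that may store
the same byte more than once), one might write something like the sketch below.
The function name is invented and memcpy with a constant size stands in for the
unaligned LDR/STR and LDP/STP pairs; this illustrates the technique, not the
actual assembly.

#include <stddef.h>
#include <string.h>

/* Sketch only: copy 0..32 bytes with at most two fixed-size, possibly
   overlapping accesses, mirroring the structure of the assembly.  */
static void
small_copy_sketch (char *dst, const char *src, size_t count)
{
  if (count >= 16)
    {
      /* 16..32 bytes: one 16-byte block from each end of the buffer.  */
      memcpy (dst, src, 16);
      memcpy (dst + count - 16, src + count - 16, 16);
    }
  else if (count >= 8)
    {
      /* 8..15 bytes: two overlapping 8-byte accesses.  */
      memcpy (dst, src, 8);
      memcpy (dst + count - 8, src + count - 8, 8);
    }
  else if (count >= 4)
    {
      /* 4..7 bytes: two overlapping 4-byte accesses.  */
      memcpy (dst, src, 4);
      memcpy (dst + count - 4, src + count - 4, 4);
    }
  else if (count > 0)
    {
      /* 1..3 bytes, branchless: byte 0, the middle byte and the last byte.
         The same byte is stored more than once when count is 1 or 2,
         exactly as in the assembly.  */
      dst[0] = src[0];
      dst[count >> 1] = src[count >> 1];
      dst[count - 1] = src[count - 1];
    }
}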
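The 33..128-byte path works the same way with larger blocks; the point of the
96-byte split mentioned in the commit message is that 65..96-byte copies skip
the two 16-byte blocks that are only needed for 97..128 bytes. Again an
illustrative sketch with an invented name, not the real code:

#include <stddef.h>
#include <string.h>

/* Sketch only: medium copies, count assumed to be 33..128.  */
static void
medium_copy_sketch (char *dst, const char *src, size_t count)
{
  memcpy (dst, src, 32);                                 /* A/B: first 32 bytes  */
  memcpy (dst + count - 32, src + count - 32, 32);       /* C/D: last 32 bytes   */
  if (count > 64)
    {
      memcpy (dst + 32, src + 32, 32);                   /* E/F: bytes 32..63    */
      if (count > 96)
        memcpy (dst + count - 64, src + count - 64, 32); /* G/H: 97..128 only    */
    }
}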
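For the large-copy path, the overlap check and the tail handling can be sketched
as follows. The unsigned comparison corresponds to the assembly's
"sub tmp1, dstin, src" followed by "cmp tmp1, count": only when dst lies inside
[src, src + count) is the backward loop needed. The tail is handled by
unconditionally copying the last 64 bytes, so the loop never needs a
byte-granular remainder. The function name is invented, count is assumed to be
greater than 128 as in L(copy_long), and the backward loop, the 16-byte
alignment of dst and the software pipelining of the real code are omitted.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Sketch only: forward large copy, count > 128, buffers assumed not to
   overlap harmfully (the real code branches to a backward loop then).  */
static void
large_copy_forward_sketch (char *dst, const char *src, size_t count)
{
  /* Same test as the assembly: dst - src treated as an unsigned value.  */
  if ((uintptr_t) (dst - src) < count)
    return;  /* dst inside the source: the backward path would run instead */

  size_t i = 0;
  while (count - i > 64)
    {
      memcpy (dst + i, src + i, 64);   /* 64 bytes per iteration */
      i += 64;
    }
  /* Always copy 64 bytes from the end: safe because count > 128, even if
     only a single byte of remainder is left.  */
  memcpy (dst + count - 64, src + count - 64, 64);
}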