author    | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2023-07-07 00:57:30 +0000
committer | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2023-07-07 00:57:30 +0000
commit    | 0edd6499aaed16bf45de92bb0ad1c729486ce6f4 (patch)
tree      | b6182e391304fb3a42c51d482dcf671f540f2363 /string/aarch64
parent    | f2e7d2de0fe4c2bddb59992ba401391f38627a1e (diff)
parent    | 172d24a7ae67ee7bae413d5a8618f1b5edc002be (diff)
download  | arm-optimized-routines-android14-mainline-cellbroadcast-release.tar.gz
Snap for 10447354 from 172d24a7ae67ee7bae413d5a8618f1b5edc002be to mainline-cellbroadcast-release
Tags: aml_cbr_341810000, aml_cbr_341710000, aml_cbr_341610000, aml_cbr_341510010, aml_cbr_341410010, aml_cbr_341311010, aml_cbr_341110000, aml_cbr_341011000, aml_cbr_340914000, android14-mainline-cellbroadcast-release
Change-Id: I8753ae14d61308952964b5f87c7e48044f60727c
Diffstat (limited to 'string/aarch64')
40 files changed, 1050 insertions, 1510 deletions
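
Editorial note (not part of the commit): the hunks below relicense the files from MIT to "MIT OR Apache-2.0 WITH LLVM-exception", move asmdefs.h into string/aarch64/ (so includes change from "../asmdefs.h" to "asmdefs.h"), drop the separate strcmp-mte.S / strcpy-mte.S / stpcpy-mte.S variants, add an SVE memcpy/memmove, and rewrite several MTE-compatible routines to build their per-chunk syndrome with a single shrn-by-4 "nibble mask" instead of the old 0xf00f repmask plus addp. The C sketch below is my own illustration of that nibble-mask technique, assuming a little-endian AArch64 target with NEON intrinsics; first_match_index is a hypothetical helper, not a function from this patch.

/* Illustrative model of the nibble-mask syndrome used by the rewritten
 * MTE-compatible routines (memchr, memrchr, strchrnul, strlen, strcpy):
 * compare 16 bytes at once, narrow the 0x00/0xFF comparison result to a
 * 64-bit value with 4 syndrome bits per byte via shrn #4, then locate the
 * first match with a trailing-zero count. */
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper: index of the first byte equal to c in a 16-byte
 * chunk, or -1 if none matches. */
static int first_match_index(const uint8_t *chunk, uint8_t c)
{
    uint8x16_t data  = vld1q_u8(chunk);
    uint8x16_t match = vceqq_u8(data, vdupq_n_u8(c));        /* cmeq: 0xFF per hit */
    /* shrn #4 keeps one nibble per source byte -> 64-bit syndrome. */
    uint8x8_t  nib   = vshrn_n_u16(vreinterpretq_u16_u8(match), 4);
    uint64_t   synd  = vget_lane_u64(vreinterpret_u64_u8(nib), 0);
    if (synd == 0)
        return -1;                       /* no match in this chunk */
    return __builtin_ctzll(synd) >> 2;   /* 4 syndrome bits per byte */
}

int main(void)
{
    const uint8_t buf[16] = "0123456789abcde";
    printf("%d\n", first_match_index(buf, 'a'));   /* prints 10 */
    return 0;
}

The assembly versions additionally mask off syndrome bits for bytes before an unaligned starting address (lsl/lsr by srcin*2) and handle big-endian by skipping the rbit before clz; the sketch omits both.
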
diff --git a/string/aarch64/__mtag_tag_region.S b/string/aarch64/__mtag_tag_region.S index 84339f7..207e229 100644 --- a/string/aarch64/__mtag_tag_region.S +++ b/string/aarch64/__mtag_tag_region.S @@ -1,8 +1,8 @@ /* * __mtag_tag_region - tag memory * - * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -15,7 +15,7 @@ * The memory region may remain untagged if tagging is not enabled. */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_MEMORY_TAGGING diff --git a/string/aarch64/__mtag_tag_zero_region.S b/string/aarch64/__mtag_tag_zero_region.S index f58364c..44b8e01 100644 --- a/string/aarch64/__mtag_tag_zero_region.S +++ b/string/aarch64/__mtag_tag_zero_region.S @@ -1,8 +1,8 @@ /* * __mtag_tag_zero_region - tag memory and fill it with zero bytes * - * Copyright (c) 2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2021-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -15,7 +15,7 @@ * The memory region may remain untagged if tagging is not enabled. */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_MEMORY_TAGGING diff --git a/string/aarch64/asmdefs.h b/string/aarch64/asmdefs.h new file mode 100644 index 0000000..069b146 --- /dev/null +++ b/string/aarch64/asmdefs.h @@ -0,0 +1,92 @@ +/* + * Macros for asm code. AArch64 version. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef _ASMDEFS_H +#define _ASMDEFS_H + +/* Branch Target Identitication support. */ +#define BTI_C hint 34 +#define BTI_J hint 36 +/* Return address signing support (pac-ret). */ +#define PACIASP hint 25; .cfi_window_save +#define AUTIASP hint 29; .cfi_window_save + +/* GNU_PROPERTY_AARCH64_* macros from elf.h. */ +#define FEATURE_1_AND 0xc0000000 +#define FEATURE_1_BTI 1 +#define FEATURE_1_PAC 2 + +/* Add a NT_GNU_PROPERTY_TYPE_0 note. */ +#define GNU_PROPERTY(type, value) \ + .section .note.gnu.property, "a"; \ + .p2align 3; \ + .word 4; \ + .word 16; \ + .word 5; \ + .asciz "GNU"; \ + .word type; \ + .word 4; \ + .word value; \ + .word 0; \ + .text + +/* If set then the GNU Property Note section will be added to + mark objects to support BTI and PAC-RET. */ +#ifndef WANT_GNU_PROPERTY +#define WANT_GNU_PROPERTY 1 +#endif + +#if WANT_GNU_PROPERTY +/* Add property note with supported features to all asm files. 
*/ +GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) +#endif + +#define ENTRY_ALIGN(name, alignment) \ + .global name; \ + .type name,%function; \ + .align alignment; \ + name: \ + .cfi_startproc; \ + BTI_C; + +#define ENTRY(name) ENTRY_ALIGN(name, 6) + +#define ENTRY_ALIAS(name) \ + .global name; \ + .type name,%function; \ + name: + +#define END(name) \ + .cfi_endproc; \ + .size name, .-name; + +#define L(l) .L ## l + +#ifdef __ILP32__ + /* Sanitize padding bits of pointer arguments as per aapcs64 */ +#define PTR_ARG(n) mov w##n, w##n +#else +#define PTR_ARG(n) +#endif + +#ifdef __ILP32__ + /* Sanitize padding bits of size arguments as per aapcs64 */ +#define SIZE_ARG(n) mov w##n, w##n +#else +#define SIZE_ARG(n) +#endif + +/* Compiler supports SVE instructions */ +#ifndef HAVE_SVE +# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5) +# define HAVE_SVE 1 +# else +# define HAVE_SVE 0 +# endif +#endif + +#endif diff --git a/string/aarch64/check-arch.S b/string/aarch64/check-arch.S index 5a54242..131b7fa 100644 --- a/string/aarch64/check-arch.S +++ b/string/aarch64/check-arch.S @@ -1,8 +1,8 @@ /* * check ARCH setting. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #if !__aarch64__ @@ -10,4 +10,4 @@ #endif /* Include for GNU property notes. */ -#include "../asmdefs.h" +#include "asmdefs.h" diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S index c2e967d..948c3cb 100644 --- a/string/aarch64/memchr-mte.S +++ b/string/aarch64/memchr-mte.S @@ -1,8 +1,8 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 @@ -23,25 +23,21 @@ #define synd x5 #define shift x6 #define tmp x7 -#define wtmp w7 #define vrepchr v0 #define qdata q1 #define vdata v1 #define vhas_chr v2 -#define vrepmask v3 -#define vend v4 -#define dend d4 +#define vend v3 +#define dend d3 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. 
*/ ENTRY (__memchr_aarch64_mte) PTR_ARG (0) @@ -50,55 +46,53 @@ ENTRY (__memchr_aarch64_mte) cbz cntin, L(nomatch) ld1 {vdata.16b}, [src] dup vrepchr.16b, chrin - mov wtmp, 0xf00f - dup vrepmask.8h, wtmp cmeq vhas_chr.16b, vdata.16b, vrepchr.16b lsl shift, srcin, 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(start_loop) rbit synd, synd clz synd, synd - add result, srcin, synd, lsr 2 cmp cntin, synd, lsr 2 + add result, srcin, synd, lsr 2 csel result, result, xzr, hi ret + .p2align 3 L(start_loop): sub tmp, src, srcin - add tmp, tmp, 16 + add tmp, tmp, 17 subs cntrem, cntin, tmp - b.ls L(nomatch) + b.lo L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - add tmp, cntrem, 15 - tbnz tmp, 4, L(loop32_2) - + tbz cntrem, 4, L(loop32_2) + sub src, src, 16 .p2align 4 L(loop32): - ldr qdata, [src, 16]! + ldr qdata, [src, 32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src, 16]! - subs cntrem, cntrem, 32 + ldr qdata, [src, 16] cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - b.ls L(end) + subs cntrem, cntrem, 32 + b.lo L(end_2) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) +L(end_2): + add src, src, 16 L(end): - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + sub cntrem, src, srcin fmov synd, dend - add tmp, srcin, cntin - sub cntrem, tmp, src + sub cntrem, cntin, cntrem #ifndef __AARCH64EB__ rbit synd, synd #endif diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S index c22e659..b851cf3 100644 --- a/string/aarch64/memchr-sve.S +++ b/string/aarch64/memchr-sve.S @@ -1,11 +1,11 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S index 353f0d1..fe6cfe2 100644 --- a/string/aarch64/memchr.S +++ b/string/aarch64/memchr.S @@ -1,8 +1,8 @@ /* * memchr - find a character in a memory zone * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S index 78c5eca..d52ce45 100644 --- a/string/aarch64/memcmp-sve.S +++ b/string/aarch64/memcmp-sve.S @@ -1,11 +1,11 @@ /* * memcmp - compare memory * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S index 3b10266..35135e7 100644 --- a/string/aarch64/memcmp.S +++ b/string/aarch64/memcmp.S @@ -1,103 +1,84 @@ /* memcmp - compare memory * - * Copyright (c) 2013-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2013-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses. + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. */ -#include "../asmdefs.h" +#include "asmdefs.h" -/* Parameters and result. */ -#define src1 x0 -#define src2 x1 -#define limit x2 -#define result w0 +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result w0 + +#define data1 x3 +#define data1w w3 +#define data2 x4 +#define data2w w4 +#define data3 x5 +#define data3w w5 +#define data4 x6 +#define data4w w6 +#define tmp x6 +#define src1end x7 +#define src2end x8 -/* Internal variables. */ -#define data1 x3 -#define data1w w3 -#define data1h x4 -#define data2 x5 -#define data2w w5 -#define data2h x6 -#define tmp1 x7 -#define tmp2 x8 ENTRY (__memcmp_aarch64) PTR_ARG (0) PTR_ARG (1) SIZE_ARG (2) - subs limit, limit, 8 - b.lo L(less8) - - ldr data1, [src1], 8 - ldr data2, [src2], 8 - cmp data1, data2 - b.ne L(return) - - subs limit, limit, 8 - b.gt L(more16) - ldr data1, [src1, limit] - ldr data2, [src2, limit] - b L(return) - -L(more16): - ldr data1, [src1], 8 - ldr data2, [src2], 8 - cmp data1, data2 - bne L(return) - - /* Jump directly to comparing the last 16 bytes for 32 byte (or less) - strings. */ - subs limit, limit, 16 + cmp limit, 16 + b.lo L(less16) + ldp data1, data3, [src1] + ldp data2, data4, [src2] + ccmp data1, data2, 0, ne + ccmp data3, data4, 0, eq + b.ne L(return2) + + add src1end, src1, limit + add src2end, src2, limit + cmp limit, 32 b.ls L(last_bytes) + cmp limit, 160 + b.hs L(loop_align) + sub limit, limit, 32 - /* We overlap loads between 0-32 bytes at either side of SRC1 when we - try to align, so limit it only to strings larger than 128 bytes. */ - cmp limit, 96 - b.ls L(loop16) - - /* Align src1 and adjust src2 with bytes not yet done. */ - and tmp1, src1, 15 - add limit, limit, tmp1 - sub src1, src1, tmp1 - sub src2, src2, tmp1 - - /* Loop performing 16 bytes per iteration using aligned src1. - Limit is pre-decremented by 16 and must be larger than zero. - Exit if <= 16 bytes left to do or if the data is not equal. */ .p2align 4 -L(loop16): - ldp data1, data1h, [src1], 16 - ldp data2, data2h, [src2], 16 - subs limit, limit, 16 - ccmp data1, data2, 0, hi - ccmp data1h, data2h, 0, eq - b.eq L(loop16) - +L(loop32): + ldp data1, data3, [src1, 16] + ldp data2, data4, [src2, 16] cmp data1, data2 - bne L(return) - mov data1, data1h - mov data2, data2h + ccmp data3, data4, 0, eq + b.ne L(return2) + cmp limit, 16 + b.ls L(last_bytes) + + ldp data1, data3, [src1, 32] + ldp data2, data4, [src2, 32] cmp data1, data2 - bne L(return) + ccmp data3, data4, 0, eq + b.ne L(return2) + add src1, src1, 32 + add src2, src2, 32 +L(last64): + subs limit, limit, 32 + b.hi L(loop32) /* Compare last 1-16 bytes using unaligned access. 
*/ L(last_bytes): - add src1, src1, limit - add src2, src2, limit - ldp data1, data1h, [src1] - ldp data2, data2h, [src2] - cmp data1, data2 - bne L(return) - mov data1, data1h - mov data2, data2h + ldp data1, data3, [src1end, -16] + ldp data2, data4, [src2end, -16] +L(return2): cmp data1, data2 + csel data1, data1, data3, ne + csel data2, data2, data4, ne /* Compare data bytes and set return value to 0, -1 or 1. */ L(return): @@ -105,33 +86,105 @@ L(return): rev data1, data1 rev data2, data2 #endif - cmp data1, data2 -L(ret_eq): + cmp data1, data2 cset result, ne cneg result, result, lo ret .p2align 4 - /* Compare up to 8 bytes. Limit is [-8..-1]. */ +L(less16): + add src1end, src1, limit + add src2end, src2, limit + tbz limit, 3, L(less8) + ldr data1, [src1] + ldr data2, [src2] + ldr data3, [src1end, -8] + ldr data4, [src2end, -8] + b L(return2) + + .p2align 4 L(less8): - adds limit, limit, 4 - b.lo L(less4) - ldr data1w, [src1], 4 - ldr data2w, [src2], 4 + tbz limit, 2, L(less4) + ldr data1w, [src1] + ldr data2w, [src2] + ldr data3w, [src1end, -4] + ldr data4w, [src2end, -4] + b L(return2) + +L(less4): + tbz limit, 1, L(less2) + ldrh data1w, [src1] + ldrh data2w, [src2] cmp data1w, data2w b.ne L(return) - sub limit, limit, 4 -L(less4): - adds limit, limit, 4 - beq L(ret_eq) -L(byte_loop): - ldrb data1w, [src1], 1 - ldrb data2w, [src2], 1 - subs limit, limit, 1 - ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ - b.eq L(byte_loop) +L(less2): + mov result, 0 + tbz limit, 0, L(return_zero) + ldrb data1w, [src1end, -1] + ldrb data2w, [src2end, -1] sub result, data1w, data2w +L(return_zero): ret -END (__memcmp_aarch64) +L(loop_align): + ldp data1, data3, [src1, 16] + ldp data2, data4, [src2, 16] + cmp data1, data2 + ccmp data3, data4, 0, eq + b.ne L(return2) + + /* Align src2 and adjust src1, src2 and limit. */ + and tmp, src2, 15 + sub tmp, tmp, 16 + sub src2, src2, tmp + add limit, limit, tmp + sub src1, src1, tmp + sub limit, limit, 64 + 16 + + .p2align 4 +L(loop64): + ldr q0, [src1, 16] + ldr q1, [src2, 16] + subs limit, limit, 64 + ldr q2, [src1, 32] + ldr q3, [src2, 32] + eor v0.16b, v0.16b, v1.16b + eor v1.16b, v2.16b, v3.16b + ldr q2, [src1, 48] + ldr q3, [src2, 48] + umaxp v0.16b, v0.16b, v1.16b + ldr q4, [src1, 64]! + ldr q5, [src2, 64]! + eor v1.16b, v2.16b, v3.16b + eor v2.16b, v4.16b, v5.16b + umaxp v1.16b, v1.16b, v2.16b + umaxp v0.16b, v0.16b, v1.16b + umaxp v0.16b, v0.16b, v0.16b + fmov tmp, d0 + ccmp tmp, 0, 0, hi + b.eq L(loop64) + + /* If equal, process last 1-64 bytes using scalar loop. */ + add limit, limit, 64 + 16 + cbz tmp, L(last64) + + /* Determine the 8-byte aligned offset of the first difference. */ +#ifdef __AARCH64EB__ + rev16 tmp, tmp +#endif + rev tmp, tmp + clz tmp, tmp + bic tmp, tmp, 7 + sub tmp, tmp, 48 + ldr data1, [src1, tmp] + ldr data2, [src2, tmp] +#ifndef __AARCH64EB__ + rev data1, data1 + rev data2, data2 +#endif + mov result, 1 + cmp data1, data2 + cneg result, result, lo + ret +END (__memcmp_aarch64) diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S index f97f2c3..e6527d0 100644 --- a/string/aarch64/memcpy-advsimd.S +++ b/string/aarch64/memcpy-advsimd.S @@ -1,8 +1,8 @@ /* * memcpy - copy memory area * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * */ -#include "../asmdefs.h" +#include "asmdefs.h" #define dstin x0 #define src x1 diff --git a/string/aarch64/memcpy-sve.S b/string/aarch64/memcpy-sve.S new file mode 100644 index 0000000..e8a946d --- /dev/null +++ b/string/aarch64/memcpy-sve.S @@ -0,0 +1,177 @@ +/* + * memcpy - copy memory area + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses. + * + */ + +#include "asmdefs.h" + +#ifdef HAVE_SVE + +.arch armv8-a+sve + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define tmp1 x6 +#define vlen x6 + +#define A_q q0 +#define B_q q1 +#define C_q q2 +#define D_q q3 +#define E_q q4 +#define F_q q5 +#define G_q q6 +#define H_q q7 + +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. + SVE vectors are used to speedup small copies. + + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. + + Large copies use a software pipelined loop processing 64 bytes per iteration. + The source pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. +*/ + +ENTRY_ALIAS (__memmove_aarch64_sve) +ENTRY (__memcpy_aarch64_sve) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + + cmp count, 128 + b.hi L(copy_long) + cntb vlen + cmp count, vlen, lsl 1 + b.hi L(copy32_128) + + whilelo p0.b, xzr, count + whilelo p1.b, vlen, count + ld1b z0.b, p0/z, [src, 0, mul vl] + ld1b z1.b, p1/z, [src, 1, mul vl] + st1b z0.b, p0, [dstin, 0, mul vl] + st1b z1.b, p1, [dstin, 1, mul vl] + ret + + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + add srcend, src, count + add dstend, dstin, count + ldp A_q, B_q, [src] + ldp C_q, D_q, [srcend, -32] + cmp count, 64 + b.hi L(copy128) + stp A_q, B_q, [dstin] + stp C_q, D_q, [dstend, -32] + ret + + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_q, F_q, [src, 32] + cmp count, 96 + b.ls L(copy96) + ldp G_q, H_q, [srcend, -64] + stp G_q, H_q, [dstend, -64] +L(copy96): + stp A_q, B_q, [dstin] + stp E_q, F_q, [dstin, 32] + stp C_q, D_q, [dstend, -32] + ret + + /* Copy more than 128 bytes. */ +L(copy_long): + add srcend, src, count + add dstend, dstin, count + + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align src to 16-byte alignment. */ + ldr D_q, [src] + and tmp1, src, 15 + bic src, src, 15 + sub dst, dstin, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_q, B_q, [src, 16] + str D_q, [dstin] + ldp C_q, D_q, [src, 48] + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(copy64_from_end) +L(loop64): + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [src, 80] + stp C_q, D_q, [dst, 48] + ldp C_q, D_q, [src, 112] + add src, src, 64 + add dst, dst, 64 + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. 
*/ +L(copy64_from_end): + ldp E_q, F_q, [srcend, -64] + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [srcend, -32] + stp C_q, D_q, [dst, 48] + stp E_q, F_q, [dstend, -64] + stp A_q, B_q, [dstend, -32] + ret + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align srcend to 16-byte alignment. */ +L(copy_long_backwards): + cbz tmp1, L(return) + ldr D_q, [srcend, -16] + and tmp1, srcend, 15 + bic srcend, srcend, 15 + sub count, count, tmp1 + ldp A_q, B_q, [srcend, -32] + str D_q, [dstend, -16] + ldp C_q, D_q, [srcend, -64] + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + str B_q, [dstend, -16] + str A_q, [dstend, -32] + ldp A_q, B_q, [srcend, -96] + str D_q, [dstend, -48] + str C_q, [dstend, -64]! + ldp C_q, D_q, [srcend, -128] + sub srcend, srcend, 64 + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. */ +L(copy64_from_start): + ldp E_q, F_q, [src, 32] + stp A_q, B_q, [dstend, -32] + ldp A_q, B_q, [src] + stp C_q, D_q, [dstend, -64] + stp E_q, F_q, [dstin, 32] + stp A_q, B_q, [dstin] +L(return): + ret + +END (__memcpy_aarch64_sve) + +#endif diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S index dd254f6..7c0606e 100644 --- a/string/aarch64/memcpy.S +++ b/string/aarch64/memcpy.S @@ -1,8 +1,8 @@ /* * memcpy - copy memory area * - * Copyright (c) 2012-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * */ -#include "../asmdefs.h" +#include "asmdefs.h" #define dstin x0 #define src x1 diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S index 7b4be84..6418bdf 100644 --- a/string/aarch64/memrchr.S +++ b/string/aarch64/memrchr.S @@ -1,8 +1,8 @@ /* * memrchr - find last character in a memory zone. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 @@ -23,7 +23,6 @@ #define synd x5 #define shift x6 #define tmp x7 -#define wtmp w7 #define end x8 #define endm1 x9 @@ -31,19 +30,16 @@ #define qdata q1 #define vdata v1 #define vhas_chr v2 -#define vrepmask v3 -#define vend v4 -#define dend d4 +#define vend v3 +#define dend d3 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. 
*/ ENTRY (__memrchr_aarch64) PTR_ARG (0) @@ -53,12 +49,9 @@ ENTRY (__memrchr_aarch64) cbz cntin, L(nomatch) ld1 {vdata.16b}, [src] dup vrepchr.16b, chrin - mov wtmp, 0xf00f - dup vrepmask.8h, wtmp cmeq vhas_chr.16b, vdata.16b, vrepchr.16b neg shift, end, lsl 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend lsl synd, synd, shift cbz synd, L(start_loop) @@ -69,34 +62,36 @@ ENTRY (__memrchr_aarch64) csel result, result, xzr, hi ret + nop L(start_loop): - sub tmp, end, src - subs cntrem, cntin, tmp + subs cntrem, src, srcin b.ls L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - add tmp, cntrem, 15 - tbnz tmp, 4, L(loop32_2) + sub cntrem, cntrem, 1 + tbz cntrem, 4, L(loop32_2) + add src, src, 16 - .p2align 4 + .p2align 5 L(loop32): - ldr qdata, [src, -16]! + ldr qdata, [src, -32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src, -16]! + ldr qdata, [src, -16] subs cntrem, cntrem, 32 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - b.ls L(end) + b.lo L(end_2) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) +L(end_2): + sub src, src, 16 L(end): - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend add tmp, src, 15 diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S index 9fcd975..553b0fc 100644 --- a/string/aarch64/memset.S +++ b/string/aarch64/memset.S @@ -1,8 +1,8 @@ /* * memset - fill memory with a constant byte * - * Copyright (c) 2012-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2012-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * */ -#include "../asmdefs.h" +#include "asmdefs.h" #define dstin x0 #define val x1 diff --git a/string/aarch64/stpcpy-mte.S b/string/aarch64/stpcpy-mte.S deleted file mode 100644 index f1c7119..0000000 --- a/string/aarch64/stpcpy-mte.S +++ /dev/null @@ -1,10 +0,0 @@ -/* - * stpcpy - copy a string returning pointer to end. - * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#define BUILD_STPCPY 1 - -#include "strcpy-mte.S" diff --git a/string/aarch64/stpcpy-sve.S b/string/aarch64/stpcpy-sve.S index 82dd971..5d3f14b 100644 --- a/string/aarch64/stpcpy-sve.S +++ b/string/aarch64/stpcpy-sve.S @@ -2,7 +2,7 @@ * stpcpy - copy a string returning pointer to end. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define BUILD_STPCPY 1 diff --git a/string/aarch64/stpcpy.S b/string/aarch64/stpcpy.S index 4f62aa4..155c68d 100644 --- a/string/aarch64/stpcpy.S +++ b/string/aarch64/stpcpy.S @@ -2,7 +2,7 @@ * stpcpy - copy a string returning pointer to end. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define BUILD_STPCPY 1 diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S index dcb0e46..6ec08f7 100644 --- a/string/aarch64/strchr-mte.S +++ b/string/aarch64/strchr-mte.S @@ -1,8 +1,8 @@ /* * strchr - find a character in a string * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 @@ -19,8 +19,7 @@ #define src x2 #define tmp1 x1 -#define wtmp2 w3 -#define tmp3 x3 +#define tmp2 x3 #define vrepchr v0 #define vdata v1 @@ -28,39 +27,30 @@ #define vhas_nul v2 #define vhas_chr v3 #define vrepmask v4 -#define vrepmask2 v5 -#define vend v6 -#define dend d6 +#define vend v5 +#define dend d5 /* Core algorithm. For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-1 are set if the relevant byte matched the - requested character, bits 2-3 are set if the byte is NUL (or matched), and - bits 4-7 are not used and must be zero if none of bits 0-3 are set). Odd - bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits - in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + per byte. Bits 0-1 are set if the relevant byte matched the requested + character, bits 2-3 are set if the byte is NUL or matched. Count trailing + zeroes gives the position of the matching byte if it is a multiple of 4. + If it is not a multiple of 4, there was no match. */ ENTRY (__strchr_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin ld1 {vdata.16b}, [src] - mov wtmp2, 0x3003 - dup vrepmask.8h, wtmp2 + movi vrepmask.16b, 0x33 cmeq vhas_nul.16b, vdata.16b, 0 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - mov wtmp2, 0xf00f - dup vrepmask2.8h, wtmp2 - bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - lsl tmp3, srcin, 2 - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ - + lsl tmp2, srcin, 2 + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov tmp1, dend - lsr tmp1, tmp1, tmp3 + lsr tmp1, tmp1, tmp2 cbz tmp1, L(loop) rbit tmp1, tmp1 @@ -74,28 +64,34 @@ ENTRY (__strchr_aarch64_mte) .p2align 4 L(loop): - ldr qdata, [src, 16]! + ldr qdata, [src, 16] + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov tmp1, dend + cbnz tmp1, L(end) + ldr qdata, [src, 32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov tmp1, dend cbz tmp1, L(loop) + sub src, src, 16 +L(end): #ifdef __AARCH64EB__ bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov tmp1, dend #else bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov tmp1, dend rbit tmp1, tmp1 #endif + add src, src, 16 clz tmp1, tmp1 - /* Tmp1 is an even multiple of 2 if the target character was - found first. Otherwise we've found the end of string. */ + /* Tmp1 is a multiple of 4 if the target character was found. */ tst tmp1, 2 add result, src, tmp1, lsr 2 csel result, result, xzr, eq diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S index 13ba9f4..ff07516 100644 --- a/string/aarch64/strchr-sve.S +++ b/string/aarch64/strchr-sve.S @@ -1,11 +1,11 @@ /* * strchr/strchrnul - find a character in a string * - * Copyright (c) 2018-2021, Arm Limited. 
- * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S index 1063cbf..37193bd 100644 --- a/string/aarch64/strchr.S +++ b/string/aarch64/strchr.S @@ -1,8 +1,8 @@ /* * strchr - find a character in a string * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S index 1b0d0a6..543ee88 100644 --- a/string/aarch64/strchrnul-mte.S +++ b/string/aarch64/strchrnul-mte.S @@ -1,8 +1,8 @@ /* * strchrnul - find a character or nul in a string * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 @@ -20,38 +20,32 @@ #define src x2 #define tmp1 x1 #define tmp2 x3 -#define tmp2w w3 #define vrepchr v0 #define vdata v1 #define qdata q1 #define vhas_nul v2 #define vhas_chr v3 -#define vrepmask v4 -#define vend v5 -#define dend d5 +#define vend v4 +#define dend d4 -/* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ +/* + Core algorithm: + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. */ ENTRY (__strchrnul_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin ld1 {vdata.16b}, [src] - mov tmp2w, 0xf00f - dup vrepmask.8h, tmp2w cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b lsl tmp2, srcin, 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov tmp1, dend lsr tmp1, tmp1, tmp2 /* Mask padding bits. */ cbz tmp1, L(loop) @@ -63,15 +57,22 @@ ENTRY (__strchrnul_aarch64_mte) .p2align 4 L(loop): - ldr qdata, [src, 16]! + ldr qdata, [src, 16] + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b + fmov tmp1, dend + cbnz tmp1, L(end) + ldr qdata, [src, 32]! 
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_chr.16b, vhas_chr.16b fmov tmp1, dend cbz tmp1, L(loop) - - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + sub src, src, 16 +L(end): + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ + add src, src, 16 fmov tmp1, dend #ifndef __AARCH64EB__ rbit tmp1, tmp1 diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/strchrnul-sve.S index 428ff1a..0005f91 100644 --- a/string/aarch64/strchrnul-sve.S +++ b/string/aarch64/strchrnul-sve.S @@ -2,7 +2,7 @@ * strchrnul - find a character or nul in a string * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #define BUILD_STRCHRNUL diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S index a4230d9..666e8d0 100644 --- a/string/aarch64/strchrnul.S +++ b/string/aarch64/strchrnul.S @@ -1,8 +1,8 @@ /* * strchrnul - find a character or nul in a string * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 diff --git a/string/aarch64/strcmp-mte.S b/string/aarch64/strcmp-mte.S deleted file mode 100644 index 12d1a6b..0000000 --- a/string/aarch64/strcmp-mte.S +++ /dev/null @@ -1,189 +0,0 @@ -/* - * strcmp - compare two strings - * - * Copyright (c) 2012-2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ - - -/* Assumptions: - * - * ARMv8-a, AArch64. - * MTE compatible. - */ - -#include "../asmdefs.h" - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f - -#define src1 x0 -#define src2 x1 -#define result x0 - -#define data1 x2 -#define data1w w2 -#define data2 x3 -#define data2w w3 -#define has_nul x4 -#define diff x5 -#define off1 x5 -#define syndrome x6 -#define tmp x6 -#define data3 x7 -#define zeroones x8 -#define shift x9 -#define off2 x10 - -/* On big-endian early bytes are at MSB and on little-endian LSB. - LS_FW means shifting towards early bytes. */ -#ifdef __AARCH64EB__ -# define LS_FW lsl -#else -# define LS_FW lsr -#endif - -/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. - Since carry propagation makes 0x1 bytes before a NUL byte appear - NUL too in big-endian, byte-reverse the data before the NUL check. */ - - -ENTRY (__strcmp_aarch64_mte) - PTR_ARG (0) - PTR_ARG (1) - sub off2, src2, src1 - mov zeroones, REP8_01 - and tmp, src1, 7 - tst off2, 7 - b.ne L(misaligned8) - cbnz tmp, L(mutual_align) - - .p2align 4 - -L(loop_aligned): - ldr data2, [src1, off2] - ldr data1, [src1], 8 -L(start_realigned): -#ifdef __AARCH64EB__ - rev tmp, data1 - sub has_nul, tmp, zeroones - orr tmp, tmp, REP8_7f -#else - sub has_nul, data1, zeroones - orr tmp, data1, REP8_7f -#endif - bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. 
*/ - ccmp data1, data2, 0, eq - b.eq L(loop_aligned) -#ifdef __AARCH64EB__ - rev has_nul, has_nul -#endif - eor diff, data1, data2 - orr syndrome, diff, has_nul -L(end): -#ifndef __AARCH64EB__ - rev syndrome, syndrome - rev data1, data1 - rev data2, data2 -#endif - clz shift, syndrome - /* The most-significant-non-zero bit of the syndrome marks either the - first bit that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ - lsl data1, data1, shift - lsl data2, data2, shift - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, 56 - sub result, data1, data2, lsr 56 - ret - - .p2align 4 - -L(mutual_align): - /* Sources are mutually aligned, but are not currently at an - alignment boundary. Round down the addresses and then mask off - the bytes that precede the start point. */ - bic src1, src1, 7 - ldr data2, [src1, off2] - ldr data1, [src1], 8 - neg shift, src2, lsl 3 /* Bits to alignment -64. */ - mov tmp, -1 - LS_FW tmp, tmp, shift - orr data1, data1, tmp - orr data2, data2, tmp - b L(start_realigned) - -L(misaligned8): - /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always - checking to make sure that we don't access beyond the end of SRC2. */ - cbz tmp, L(src1_aligned) -L(do_misaligned): - ldrb data1w, [src1], 1 - ldrb data2w, [src2], 1 - cmp data1w, 0 - ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ - b.ne L(done) - tst src1, 7 - b.ne L(do_misaligned) - -L(src1_aligned): - neg shift, src2, lsl 3 - bic src2, src2, 7 - ldr data3, [src2], 8 -#ifdef __AARCH64EB__ - rev data3, data3 -#endif - lsr tmp, zeroones, shift - orr data3, data3, tmp - sub has_nul, data3, zeroones - orr tmp, data3, REP8_7f - bics has_nul, has_nul, tmp - b.ne L(tail) - - sub off1, src2, src1 - - .p2align 4 - -L(loop_unaligned): - ldr data3, [src1, off1] - ldr data2, [src1, off2] -#ifdef __AARCH64EB__ - rev data3, data3 -#endif - sub has_nul, data3, zeroones - orr tmp, data3, REP8_7f - ldr data1, [src1], 8 - bics has_nul, has_nul, tmp - ccmp data1, data2, 0, eq - b.eq L(loop_unaligned) - - lsl tmp, has_nul, shift -#ifdef __AARCH64EB__ - rev tmp, tmp -#endif - eor diff, data1, data2 - orr syndrome, diff, tmp - cbnz syndrome, L(end) -L(tail): - ldr data1, [src1] - neg shift, shift - lsr data2, data3, shift - lsr has_nul, has_nul, shift -#ifdef __AARCH64EB__ - rev data2, data2 - rev has_nul, has_nul -#endif - eor diff, data1, data2 - orr syndrome, diff, has_nul - b L(end) - -L(done): - sub result, data1, data2 - ret - -END (__strcmp_aarch64_mte) - diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S index e6d2da5..eaf909a 100644 --- a/string/aarch64/strcmp-sve.S +++ b/string/aarch64/strcmp-sve.S @@ -1,11 +1,11 @@ /* * __strcmp_aarch64_sve - compare two strings * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S index 7714ebf..137a9aa 100644 --- a/string/aarch64/strcmp.S +++ b/string/aarch64/strcmp.S @@ -1,168 +1,184 @@ /* * strcmp - compare two strings * - * Copyright (c) 2012-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2012-2022, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ + /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64. + * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 -/* Parameters and result. */ #define src1 x0 #define src2 x1 #define result x0 -/* Internal variables. */ #define data1 x2 #define data1w w2 #define data2 x3 #define data2w w3 #define has_nul x4 #define diff x5 +#define off1 x5 #define syndrome x6 -#define tmp1 x7 -#define tmp2 x8 -#define tmp3 x9 -#define zeroones x10 -#define pos x11 +#define tmp x6 +#define data3 x7 +#define zeroones x8 +#define shift x9 +#define off2 x10 + +/* On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. */ +#ifdef __AARCH64EB__ +# define LS_FW lsl +#else +# define LS_FW lsr +#endif + +/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. + Since carry propagation makes 0x1 bytes before a NUL byte appear + NUL too in big-endian, byte-reverse the data before the NUL check. */ + - /* Start of performance-critical section -- one 64B cache line. */ ENTRY (__strcmp_aarch64) PTR_ARG (0) PTR_ARG (1) - eor tmp1, src1, src2 - mov zeroones, #REP8_01 - tst tmp1, #7 + sub off2, src2, src1 + mov zeroones, REP8_01 + and tmp, src1, 7 + tst off2, 7 b.ne L(misaligned8) - ands tmp1, src1, #7 - b.ne L(mutual_align) - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ + cbnz tmp, L(mutual_align) + + .p2align 4 + L(loop_aligned): - ldr data1, [src1], #8 - ldr data2, [src2], #8 + ldr data2, [src1, off2] + ldr data1, [src1], 8 L(start_realigned): - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ +#ifdef __AARCH64EB__ + rev tmp, data1 + sub has_nul, tmp, zeroones + orr tmp, tmp, REP8_7f +#else + sub has_nul, data1, zeroones + orr tmp, data1, REP8_7f +#endif + bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ + ccmp data1, data2, 0, eq + b.eq L(loop_aligned) +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + eor diff, data1, data2 orr syndrome, diff, has_nul - cbz syndrome, L(loop_aligned) - /* End of performance-critical section -- one 64B cache line. */ - L(end): -#ifndef __AARCH64EB__ +#ifndef __AARCH64EB__ rev syndrome, syndrome rev data1, data1 - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ - clz pos, syndrome rev data2, data2 - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - ret -#else - /* For big-endian we cannot use the trick with the syndrome value - as carry-propagation can corrupt the upper bits if the trailing - bytes in the string contain 0x01. */ - /* However, if there is no NUL byte in the dword, we can generate - the result directly. We can't just subtract the bytes as the - MSB might be significant. 
*/ - cbnz has_nul, 1f - cmp data1, data2 - cset result, ne - cneg result, result, lo - ret -1: - /* Re-compute the NUL-byte detection, using a byte-reversed value. */ - rev tmp3, data1 - sub tmp1, tmp3, zeroones - orr tmp2, tmp3, #REP8_7f - bic has_nul, tmp1, tmp2 - rev has_nul, has_nul - orr syndrome, diff, has_nul - clz pos, syndrome - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. +#endif + clz shift, syndrome + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. Shifting left now will bring the critical information into the top bits. */ - lsl data1, data1, pos - lsl data2, data2, pos + lsl data1, data1, shift + lsl data2, data2, shift /* But we need to zero-extend (char is unsigned) the value and then perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 + lsr data1, data1, 56 + sub result, data1, data2, lsr 56 ret -#endif + + .p2align 4 L(mutual_align): /* Sources are mutually aligned, but are not currently at an alignment boundary. Round down the addresses and then mask off - the bytes that preceed the start point. */ - bic src1, src1, #7 - bic src2, src2, #7 - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - ldr data1, [src1], #8 - neg tmp1, tmp1 /* Bits to alignment -64. */ - ldr data2, [src2], #8 - mov tmp2, #~0 -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#endif - orr data1, data1, tmp2 - orr data2, data2, tmp2 + the bytes that precede the start point. */ + bic src1, src1, 7 + ldr data2, [src1, off2] + ldr data1, [src1], 8 + neg shift, src2, lsl 3 /* Bits to alignment -64. */ + mov tmp, -1 + LS_FW tmp, tmp, shift + orr data1, data1, tmp + orr data2, data2, tmp b L(start_realigned) L(misaligned8): /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always - checking to make sure that we don't access beyond page boundary in - SRC2. */ - tst src1, #7 - b.eq L(loop_misaligned) + checking to make sure that we don't access beyond the end of SRC2. */ + cbz tmp, L(src1_aligned) L(do_misaligned): - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - cmp data1w, #1 - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + cmp data1w, 0 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ b.ne L(done) - tst src1, #7 + tst src1, 7 b.ne L(do_misaligned) -L(loop_misaligned): - /* Test if we are within the last dword of the end of a 4K page. If - yes then jump back to the misaligned loop to copy a byte at a time. */ - and tmp1, src2, #0xff8 - eor tmp1, tmp1, #0xff8 - cbz tmp1, L(do_misaligned) - ldr data1, [src1], #8 - ldr data2, [src2], #8 - - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. 
*/ +L(src1_aligned): + neg shift, src2, lsl 3 + bic src2, src2, 7 + ldr data3, [src2], 8 +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + lsr tmp, zeroones, shift + orr data3, data3, tmp + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + bics has_nul, has_nul, tmp + b.ne L(tail) + + sub off1, src2, src1 + + .p2align 4 + +L(loop_unaligned): + ldr data3, [src1, off1] + ldr data2, [src1, off2] +#ifdef __AARCH64EB__ + rev data3, data3 +#endif + sub has_nul, data3, zeroones + orr tmp, data3, REP8_7f + ldr data1, [src1], 8 + bics has_nul, has_nul, tmp + ccmp data1, data2, 0, eq + b.eq L(loop_unaligned) + + lsl tmp, has_nul, shift +#ifdef __AARCH64EB__ + rev tmp, tmp +#endif + eor diff, data1, data2 + orr syndrome, diff, tmp + cbnz syndrome, L(end) +L(tail): + ldr data1, [src1] + neg shift, shift + lsr data2, data3, shift + lsr has_nul, has_nul, shift +#ifdef __AARCH64EB__ + rev data2, data2 + rev has_nul, has_nul +#endif + eor diff, data1, data2 orr syndrome, diff, has_nul - cbz syndrome, L(loop_misaligned) b L(end) L(done): diff --git a/string/aarch64/strcpy-mte.S b/string/aarch64/strcpy-mte.S deleted file mode 100644 index 88c222d..0000000 --- a/string/aarch64/strcpy-mte.S +++ /dev/null @@ -1,161 +0,0 @@ -/* - * strcpy/stpcpy - copy a string returning pointer to start/end. - * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -/* Assumptions: - * - * ARMv8-a, AArch64, Advanced SIMD. - * MTE compatible. - */ - -#include "../asmdefs.h" - -#define dstin x0 -#define srcin x1 -#define result x0 - -#define src x2 -#define dst x3 -#define len x4 -#define synd x4 -#define tmp x5 -#define wtmp w5 -#define shift x5 -#define data1 x6 -#define dataw1 w6 -#define data2 x7 -#define dataw2 w7 - -#define dataq q0 -#define vdata v0 -#define vhas_nul v1 -#define vrepmask v2 -#define vend v3 -#define dend d3 -#define dataq2 q1 - -#ifdef BUILD_STPCPY -# define STRCPY __stpcpy_aarch64_mte -# define IFSTPCPY(X,...) X,__VA_ARGS__ -#else -# define STRCPY __strcpy_aarch64_mte -# define IFSTPCPY(X,...) -#endif - -/* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ - -ENTRY (STRCPY) - PTR_ARG (0) - PTR_ARG (1) - bic src, srcin, 15 - mov wtmp, 0xf00f - ld1 {vdata.16b}, [src] - dup vrepmask.8h, wtmp - cmeq vhas_nul.16b, vdata.16b, 0 - lsl shift, srcin, 2 - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - lsr synd, synd, shift - cbnz synd, L(tail) - - ldr dataq, [src, 16]! 
- cmeq vhas_nul.16b, vdata.16b, 0 - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - cbz synd, L(start_loop) - -#ifndef __AARCH64EB__ - rbit synd, synd -#endif - sub tmp, src, srcin - clz len, synd - add len, tmp, len, lsr 2 - tbz len, 4, L(less16) - sub tmp, len, 15 - ldr dataq, [srcin] - ldr dataq2, [srcin, tmp] - str dataq, [dstin] - str dataq2, [dstin, tmp] - IFSTPCPY (add result, dstin, len) - ret - - .p2align 4,,8 -L(tail): - rbit synd, synd - clz len, synd - lsr len, len, 2 - - .p2align 4 -L(less16): - tbz len, 3, L(less8) - sub tmp, len, 7 - ldr data1, [srcin] - ldr data2, [srcin, tmp] - str data1, [dstin] - str data2, [dstin, tmp] - IFSTPCPY (add result, dstin, len) - ret - - .p2align 4 -L(less8): - subs tmp, len, 3 - b.lo L(less4) - ldr dataw1, [srcin] - ldr dataw2, [srcin, tmp] - str dataw1, [dstin] - str dataw2, [dstin, tmp] - IFSTPCPY (add result, dstin, len) - ret - -L(less4): - cbz len, L(zerobyte) - ldrh dataw1, [srcin] - strh dataw1, [dstin] -L(zerobyte): - strb wzr, [dstin, len] - IFSTPCPY (add result, dstin, len) - ret - - .p2align 4 -L(start_loop): - sub len, src, srcin - ldr dataq2, [srcin] - add dst, dstin, len - str dataq2, [dstin] - - .p2align 5 -L(loop): - str dataq, [dst], 16 - ldr dataq, [src, 16]! - cmeq vhas_nul.16b, vdata.16b, 0 - umaxp vend.16b, vhas_nul.16b, vhas_nul.16b - fmov synd, dend - cbz synd, L(loop) - - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ - fmov synd, dend -#ifndef __AARCH64EB__ - rbit synd, synd -#endif - clz len, synd - lsr len, len, 2 - sub tmp, len, 15 - ldr dataq, [src, tmp] - str dataq, [dst, tmp] - IFSTPCPY (add result, dst, len) - ret - -END (STRCPY) diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S index f515462..00e72dc 100644 --- a/string/aarch64/strcpy-sve.S +++ b/string/aarch64/strcpy-sve.S @@ -1,11 +1,11 @@ /* * strcpy/stpcpy - copy a string returning pointer to start/end. * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S index 6e9ed42..97ae37e 100644 --- a/string/aarch64/strcpy.S +++ b/string/aarch64/strcpy.S @@ -1,311 +1,156 @@ /* * strcpy/stpcpy - copy a string returning pointer to start/end. * - * Copyright (c) 2013-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses, min page size 4k. + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" -/* To build as stpcpy, define BUILD_STPCPY before compiling this file. - - To test the page crossing code path more thoroughly, compile with - -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower - entry path. This option is not intended for production use. */ - -/* Arguments and results. */ #define dstin x0 #define srcin x1 +#define result x0 -/* Locals and temporaries. 
*/ #define src x2 #define dst x3 -#define data1 x4 -#define data1w w4 -#define data2 x5 -#define data2w w5 -#define has_nul1 x6 -#define has_nul2 x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define tmp4 x11 -#define zeroones x12 -#define data1a x13 -#define data2a x14 -#define pos x15 -#define len x16 -#define to_align x17 +#define len x4 +#define synd x4 +#define tmp x5 +#define shift x5 +#define data1 x6 +#define dataw1 w6 +#define data2 x7 +#define dataw2 w7 + +#define dataq q0 +#define vdata v0 +#define vhas_nul v1 +#define vend v2 +#define dend d2 +#define dataq2 q1 #ifdef BUILD_STPCPY -#define STRCPY __stpcpy_aarch64 -#else -#define STRCPY __strcpy_aarch64 -#endif - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 - - /* AArch64 systems have a minimum page size of 4k. We can do a quick - page size check for crossing this boundary on entry and if we - do not, then we can short-circuit much of the entry code. We - expect early page-crossing strings to be rare (probability of - 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite - predictable, even with random strings. - - We don't bother checking for larger page sizes, the cost of setting - up the correct page size is just not worth the extra gain from - a small reduction in the cases taking the slow path. Note that - we only care about whether the first fetch, which may be - misaligned, crosses a page boundary - after that we move to aligned - fetches for the remainder of the string. */ - -#ifdef STRCPY_TEST_PAGE_CROSS - /* Make everything that isn't Qword aligned look like a page cross. */ -#define MIN_PAGE_P2 4 +# define STRCPY __stpcpy_aarch64 +# define IFSTPCPY(X,...) X,__VA_ARGS__ #else -#define MIN_PAGE_P2 12 +# define STRCPY __strcpy_aarch64 +# define IFSTPCPY(X,...) #endif -#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2) +/* + Core algorithm: + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits + per byte. We take 4 bits of every comparison byte with shift right and narrow + by 4 instruction. Since the bits in the nibble mask reflect the order in + which things occur in the original string, counting leading zeros identifies + exactly which byte matched. */ ENTRY (STRCPY) PTR_ARG (0) PTR_ARG (1) - /* For moderately short strings, the fastest way to do the copy is to - calculate the length of the string in the same way as strlen, then - essentially do a memcpy of the result. This avoids the need for - multiple byte copies and further means that by the time we - reach the bulk copy loop we know we can always use DWord - accesses. We expect __strcpy_aarch64 to rarely be called repeatedly - with the same source string, so branch prediction is likely to - always be difficult - we mitigate against this by preferring - conditional select operations over branches whenever this is - feasible. */ - and tmp2, srcin, #(MIN_PAGE_SIZE - 1) - mov zeroones, #REP8_01 - and to_align, srcin, #15 - cmp tmp2, #(MIN_PAGE_SIZE - 16) - neg tmp1, to_align - /* The first fetch will straddle a (possible) page boundary iff - srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte - aligned string will never fail the page align check, so will - always take the fast path. 
*/ - b.gt L(page_cross) - -L(page_cross_ok): - ldp data1, data2, [srcin] -#ifdef __AARCH64EB__ - /* Because we expect the end to be found within 16 characters - (profiling shows this is the most common case), it's worth - swapping the bytes now to save having to recalculate the - termination syndrome later. We preserve data1 and data2 - so that we can re-use the values later on. */ - rev tmp2, data1 - sub tmp1, tmp2, zeroones - orr tmp2, tmp2, #REP8_7f - bics has_nul1, tmp1, tmp2 - b.ne L(fp_le8) - rev tmp4, data2 - sub tmp3, tmp4, zeroones - orr tmp4, tmp4, #REP8_7f -#else - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bics has_nul1, tmp1, tmp2 - b.ne L(fp_le8) - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f + bic src, srcin, 15 + ld1 {vdata.16b}, [src] + cmeq vhas_nul.16b, vdata.16b, 0 + lsl shift, srcin, 2 + shrn vend.8b, vhas_nul.8h, 4 + fmov synd, dend + lsr synd, synd, shift + cbnz synd, L(tail) + + ldr dataq, [src, 16]! + cmeq vhas_nul.16b, vdata.16b, 0 + shrn vend.8b, vhas_nul.8h, 4 + fmov synd, dend + cbz synd, L(start_loop) + +#ifndef __AARCH64EB__ + rbit synd, synd #endif - bics has_nul2, tmp3, tmp4 - b.eq L(bulk_entry) + sub tmp, src, srcin + clz len, synd + add len, tmp, len, lsr 2 + tbz len, 4, L(less16) + sub tmp, len, 15 + ldr dataq, [srcin] + ldr dataq2, [srcin, tmp] + str dataq, [dstin] + str dataq2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) + ret - /* The string is short (<=16 bytes). We don't know exactly how - short though, yet. Work out the exact length so that we can - quickly select the optimal copy strategy. */ -L(fp_gt8): - rev has_nul2, has_nul2 - clz pos, has_nul2 - mov tmp2, #56 - add dst, dstin, pos, lsr #3 /* Bits to bytes. */ - sub pos, tmp2, pos -#ifdef __AARCH64EB__ - lsr data2, data2, pos -#else - lsl data2, data2, pos -#endif - str data2, [dst, #1] +L(tail): + rbit synd, synd + clz len, synd + lsr len, len, 2 +L(less16): + tbz len, 3, L(less8) + sub tmp, len, 7 + ldr data1, [srcin] + ldr data2, [srcin, tmp] str data1, [dstin] -#ifdef BUILD_STPCPY - add dstin, dst, #8 -#endif + str data2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) ret -L(fp_le8): - rev has_nul1, has_nul1 - clz pos, has_nul1 - add dst, dstin, pos, lsr #3 /* Bits to bytes. */ - subs tmp2, pos, #24 /* Pos in bits. */ - b.lt L(fp_lt4) -#ifdef __AARCH64EB__ - mov tmp2, #56 - sub pos, tmp2, pos - lsr data2, data1, pos - lsr data1, data1, #32 -#else - lsr data2, data1, tmp2 -#endif - /* 4->7 bytes to copy. */ - str data2w, [dst, #-3] - str data1w, [dstin] -#ifdef BUILD_STPCPY - mov dstin, dst -#endif + .p2align 4 +L(less8): + subs tmp, len, 3 + b.lo L(less4) + ldr dataw1, [srcin] + ldr dataw2, [srcin, tmp] + str dataw1, [dstin] + str dataw2, [dstin, tmp] + IFSTPCPY (add result, dstin, len) ret -L(fp_lt4): - cbz pos, L(fp_lt2) - /* 2->3 bytes to copy. */ -#ifdef __AARCH64EB__ - lsr data1, data1, #48 -#endif - strh data1w, [dstin] - /* Fall-through, one byte (max) to go. */ -L(fp_lt2): - /* Null-terminated string. Last character must be zero! */ - strb wzr, [dst] -#ifdef BUILD_STPCPY - mov dstin, dst -#endif - ret - - .p2align 6 - /* Aligning here ensures that the entry code and main loop all lies - within one 64-byte cache line. */ -L(bulk_entry): - sub to_align, to_align, #16 - stp data1, data2, [dstin] - sub src, srcin, to_align - sub dst, dstin, to_align - b L(entry_no_page_cross) - - /* The inner loop deals with two Dwords at a time. 
This has a - slightly higher start-up cost, but we should win quite quickly, - especially on cores with a high number of issue slots per - cycle, as we get much better parallelism out of the operations. */ -L(main_loop): - stp data1, data2, [dst], #16 -L(entry_no_page_cross): - ldp data1, data2, [src], #16 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bics has_nul2, tmp3, tmp4 - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq L(main_loop) - /* Since we know we are copying at least 16 bytes, the fastest way - to deal with the tail is to determine the location of the - trailing NUL, then (re)copy the 16 bytes leading up to that. */ - cmp has_nul1, #0 -#ifdef __AARCH64EB__ - /* For big-endian, carry propagation (if the final byte in the - string is 0x01) means we cannot use has_nul directly. The - easiest way to get the correct byte is to byte-swap the data - and calculate the syndrome a second time. */ - csel data1, data1, data2, ne - rev data1, data1 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bic has_nul1, tmp1, tmp2 -#else - csel has_nul1, has_nul1, has_nul2, ne -#endif - rev has_nul1, has_nul1 - clz pos, has_nul1 - add tmp1, pos, #72 - add pos, pos, #8 - csel pos, pos, tmp1, ne - add src, src, pos, lsr #3 - add dst, dst, pos, lsr #3 - ldp data1, data2, [src, #-32] - stp data1, data2, [dst, #-16] -#ifdef BUILD_STPCPY - sub dstin, dst, #1 -#endif +L(less4): + cbz len, L(zerobyte) + ldrh dataw1, [srcin] + strh dataw1, [dstin] +L(zerobyte): + strb wzr, [dstin, len] + IFSTPCPY (add result, dstin, len) ret -L(page_cross): - bic src, srcin, #15 - /* Start by loading two words at [srcin & ~15], then forcing the - bytes that precede srcin to 0xff. This means they never look - like termination bytes. */ - ldp data1, data2, [src] - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - tst to_align, #7 - csetm tmp2, ne -#ifdef __AARCH64EB__ - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ -#else - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ + .p2align 4 +L(start_loop): + sub tmp, srcin, dstin + ldr dataq2, [srcin] + sub dst, src, tmp + str dataq2, [dstin] +L(loop): + str dataq, [dst], 32 + ldr dataq, [src, 16] + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loopend) + str dataq, [dst, -16] + ldr dataq, [src, 32]! + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbz synd, L(loop) + add dst, dst, 16 +L(loopend): + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ + fmov synd, dend + sub dst, dst, 31 +#ifndef __AARCH64EB__ + rbit synd, synd #endif - orr data1, data1, tmp2 - orr data2a, data2, tmp2 - cmp to_align, #8 - csinv data1, data1, xzr, lt - csel data2, data2, data2a, lt - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bics has_nul2, tmp3, tmp4 - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq L(page_cross_ok) - /* We now need to make data1 and data2 look like they've been - loaded directly from srcin. Do a rotate on the 128-bit value. */ - lsl tmp1, to_align, #3 /* Bytes->bits. 
*/ - neg tmp2, to_align, lsl #3 -#ifdef __AARCH64EB__ - lsl data1a, data1, tmp1 - lsr tmp4, data2, tmp2 - lsl data2, data2, tmp1 - orr tmp4, tmp4, data1a - cmp to_align, #8 - csel data1, tmp4, data2, lt - rev tmp2, data1 - rev tmp4, data2 - sub tmp1, tmp2, zeroones - orr tmp2, tmp2, #REP8_7f - sub tmp3, tmp4, zeroones - orr tmp4, tmp4, #REP8_7f -#else - lsr data1a, data1, tmp1 - lsl tmp4, data2, tmp2 - lsr data2, data2, tmp1 - orr tmp4, tmp4, data1a - cmp to_align, #8 - csel data1, tmp4, data2, lt - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f -#endif - bic has_nul1, tmp1, tmp2 - cbnz has_nul1, L(fp_le8) - bic has_nul2, tmp3, tmp4 - b L(fp_gt8) + clz len, synd + lsr len, len, 2 + add dst, dst, len + ldr dataq, [dst, tmp] + str dataq, [dst] + IFSTPCPY (add result, dst, 15) + ret END (STRCPY) - diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S index 7cf41d5..7723579 100644 --- a/string/aarch64/strlen-mte.S +++ b/string/aarch64/strlen-mte.S @@ -1,8 +1,8 @@ /* * strlen - calculate the length of a string. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define result x0 @@ -19,35 +19,26 @@ #define src x1 #define synd x2 #define tmp x3 -#define wtmp w3 #define shift x4 #define data q0 #define vdata v0 #define vhas_nul v1 -#define vrepmask v2 -#define vend v3 -#define dend d3 +#define vend v2 +#define dend d2 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + Process the string in 16-byte aligned chunks. Compute a 64-bit mask with + four bits per byte using the shrn instruction. A count trailing zeros then + identifies the first zero byte. */ ENTRY (__strlen_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 - mov wtmp, 0xf00f ld1 {vdata.16b}, [src] - dup vrepmask.8h, wtmp cmeq vhas_nul.16b, vdata.16b, 0 lsl shift, srcin, 2 - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(loop) @@ -59,19 +50,25 @@ ENTRY (__strlen_aarch64_mte) .p2align 5 L(loop): - ldr data, [src, 16]! + ldr data, [src, 16] + cmeq vhas_nul.16b, vdata.16b, 0 + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loop_end) + ldr data, [src, 32]! 
cmeq vhas_nul.16b, vdata.16b, 0 umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend cbz synd, L(loop) - - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + sub src, src, 16 +L(loop_end): + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */ sub result, src, srcin fmov synd, dend #ifndef __AARCH64EB__ rbit synd, synd #endif + add result, result, 16 clz tmp, synd add result, result, tmp, lsr 2 ret diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S index 2392493..12ebbdb 100644 --- a/string/aarch64/strlen-sve.S +++ b/string/aarch64/strlen-sve.S @@ -1,11 +1,11 @@ /* * __strlen_aarch64_sve - compute the length of a string * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S index a1b164a..6f6f08f 100644 --- a/string/aarch64/strlen.S +++ b/string/aarch64/strlen.S @@ -1,8 +1,8 @@ /* * strlen - calculate the length of a string. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Not MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define len x0 @@ -36,6 +36,7 @@ #define tmp x2 #define tmpw w2 #define synd x3 +#define syndw w3 #define shift x4 /* For the first 32 bytes, NUL detection works on the principle that @@ -110,7 +111,6 @@ ENTRY (__strlen_aarch64) add len, len, tmp1, lsr 3 ret - .p2align 3 /* Look for a NUL byte at offset 16..31 in the string. */ L(bytes16_31): ldp data1, data2, [srcin, 16] @@ -138,6 +138,7 @@ L(bytes16_31): add len, len, tmp1, lsr 3 ret + nop L(loop_entry): bic src, srcin, 31 @@ -153,18 +154,12 @@ L(loop): /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */ cmeq maskv.16b, datav1.16b, 0 sub len, src, srcin - tst synd, 0xffffffff - b.ne 1f + cbnz syndw, 1f cmeq maskv.16b, datav2.16b, 0 add len, len, 16 1: /* Generate a bitmask and compute correct byte offset. */ -#ifdef __AARCH64EB__ - bic maskv.8h, 0xf0 -#else - bic maskv.8h, 0x0f, lsl 8 -#endif - umaxp maskv.16b, maskv.16b, maskv.16b + shrn maskv.8b, maskv.8h, 4 fmov synd, maskd #ifndef __AARCH64EB__ rbit synd, synd @@ -173,8 +168,6 @@ L(loop): add len, len, tmp, lsr 2 ret - .p2align 4 - L(page_cross): bic src, srcin, 31 mov tmpw, 0x0c03 diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S deleted file mode 100644 index c9d6fc8..0000000 --- a/string/aarch64/strncmp-mte.S +++ /dev/null @@ -1,307 +0,0 @@ -/* - * strncmp - compare two strings - * - * Copyright (c) 2013-2021, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -/* Assumptions: - * - * ARMv8-a, AArch64 - */ - -#include "../asmdefs.h" - -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f - -/* Parameters and result. */ -#define src1 x0 -#define src2 x1 -#define limit x2 -#define result x0 - -/* Internal variables. 
*/ -#define data1 x3 -#define data1w w3 -#define data2 x4 -#define data2w w4 -#define has_nul x5 -#define diff x6 -#define syndrome x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define zeroones x11 -#define pos x12 -#define mask x13 -#define endloop x14 -#define count mask -#define offset pos -#define neg_offset x15 - -/* Define endian dependent shift operations. - On big-endian early bytes are at MSB and on little-endian LSB. - LS_FW means shifting towards early bytes. - LS_BK means shifting towards later bytes. - */ -#ifdef __AARCH64EB__ -#define LS_FW lsl -#define LS_BK lsr -#else -#define LS_FW lsr -#define LS_BK lsl -#endif - -ENTRY (__strncmp_aarch64_mte) - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) - cbz limit, L(ret0) - eor tmp1, src1, src2 - mov zeroones, #REP8_01 - tst tmp1, #7 - and count, src1, #7 - b.ne L(misaligned8) - cbnz count, L(mutual_align) - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ - .p2align 4 -L(loop_aligned): - ldr data1, [src1], #8 - ldr data2, [src2], #8 -L(start_realigned): - subs limit, limit, #8 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, hi /* Last Dword or differences. */ - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ - ccmp endloop, #0, #0, eq - b.eq L(loop_aligned) - /* End of main loop */ - -L(full_check): -#ifndef __AARCH64EB__ - orr syndrome, diff, has_nul - add limit, limit, 8 /* Rewind limit to before last subs. */ -L(syndrome_check): - /* Limit was reached. Check if the NUL byte or the difference - is before the limit. */ - rev syndrome, syndrome - rev data1, data1 - clz pos, syndrome - rev data2, data2 - lsl data1, data1, pos - cmp limit, pos, lsr #3 - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - csel result, result, xzr, hi - ret -#else - /* Not reached the limit, must have found the end or a diff. */ - tbz limit, #63, L(not_limit) - add tmp1, limit, 8 - cbz limit, L(not_limit) - - lsl limit, tmp1, #3 /* Bits -> bytes. */ - mov mask, #~0 - lsr mask, mask, limit - bic data1, data1, mask - bic data2, data2, mask - - /* Make sure that the NUL byte is marked in the syndrome. */ - orr has_nul, has_nul, mask - -L(not_limit): - /* For big-endian we cannot use the trick with the syndrome value - as carry-propagation can corrupt the upper bits if the trailing - bytes in the string contain 0x01. */ - /* However, if there is no NUL byte in the dword, we can generate - the result directly. We can't just subtract the bytes as the - MSB might be significant. */ - cbnz has_nul, 1f - cmp data1, data2 - cset result, ne - cneg result, result, lo - ret -1: - /* Re-compute the NUL-byte detection, using a byte-reversed value. */ - rev tmp3, data1 - sub tmp1, tmp3, zeroones - orr tmp2, tmp3, #REP8_7f - bic has_nul, tmp1, tmp2 - rev has_nul, has_nul - orr syndrome, diff, has_nul - clz pos, syndrome - /* The most-significant-non-zero bit of the syndrome marks either the - first bit that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. 
*/ -L(end_quick): - lsl data1, data1, pos - lsl data2, data2, pos - /* But we need to zero-extend (char is unsigned) the value and then - perform a signed 32-bit subtraction. */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - ret -#endif - -L(mutual_align): - /* Sources are mutually aligned, but are not currently at an - alignment boundary. Round down the addresses and then mask off - the bytes that precede the start point. - We also need to adjust the limit calculations, but without - overflowing if the limit is near ULONG_MAX. */ - bic src1, src1, #7 - bic src2, src2, #7 - ldr data1, [src1], #8 - neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ - ldr data2, [src2], #8 - mov tmp2, #~0 - LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ - /* Adjust the limit and ensure it doesn't overflow. */ - adds limit, limit, count - csinv limit, limit, xzr, lo - orr data1, data1, tmp2 - orr data2, data2, tmp2 - b L(start_realigned) - - .p2align 4 - /* Don't bother with dwords for up to 16 bytes. */ -L(misaligned8): - cmp limit, #16 - b.hs L(try_misaligned_words) - -L(byte_loop): - /* Perhaps we can do better than this. */ - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - subs limit, limit, #1 - ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq L(byte_loop) -L(done): - sub result, data1, data2 - ret - /* Align the SRC1 to a dword by doing a bytewise compare and then do - the dword loop. */ -L(try_misaligned_words): - cbz count, L(src1_aligned) - - neg count, count - and count, count, #7 - sub limit, limit, count - -L(page_end_loop): - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - cmp data1w, #1 - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.ne L(done) - subs count, count, #1 - b.hi L(page_end_loop) - - /* The following diagram explains the comparison of misaligned strings. - The bytes are shown in natural order. For little-endian, it is - reversed in the registers. The "x" bytes are before the string. - The "|" separates data that is loaded at one time. - src1 | a a a a a a a a | b b b c c c c c | . . . - src2 | x x x x x a a a a a a a a b b b | c c c c c . . . - - After shifting in each step, the data looks like this: - STEP_A STEP_B STEP_C - data1 a a a a a a a a b b b c c c c c b b b c c c c c - data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c - - The bytes with "0" are eliminated from the syndrome via mask. - - Align SRC2 down to 16 bytes. This way we can read 16 bytes at a - time from SRC2. The comparison happens in 3 steps. After each step - the loop can exit, or read from SRC1 or SRC2. */ -L(src1_aligned): - /* Calculate offset from 8 byte alignment to string start in bits. No - need to mask offset since shifts are ignoring upper bits. */ - lsl offset, src2, #3 - bic src2, src2, #0xf - mov mask, -1 - neg neg_offset, offset - ldr data1, [src1], #8 - ldp tmp1, tmp2, [src2], #16 - LS_BK mask, mask, neg_offset - and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ - /* Skip the first compare if data in tmp1 is irrelevant. */ - tbnz offset, 6, L(misaligned_mid_loop) - -L(loop_misaligned): - /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ - LS_FW data2, tmp1, offset - LS_BK tmp1, tmp2, neg_offset - subs limit, limit, #8 - orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ - sub has_nul, data1, zeroones - eor diff, data1, data2 /* Non-zero if differences found. 
*/ - orr tmp3, data1, #REP8_7f - csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ - bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ - orr tmp3, endloop, has_nul - cbnz tmp3, L(full_check) - - ldr data1, [src1], #8 -L(misaligned_mid_loop): - /* STEP_B: Compare first part of data1 to second part of tmp2. */ - LS_FW data2, tmp2, offset -#ifdef __AARCH64EB__ - /* For big-endian we do a byte reverse to avoid carry-propagation - problem described above. This way we can reuse the has_nul in the - next step and also use syndrome value trick at the end. */ - rev tmp3, data1 - #define data1_fixed tmp3 -#else - #define data1_fixed data1 -#endif - sub has_nul, data1_fixed, zeroones - orr tmp3, data1_fixed, #REP8_7f - eor diff, data2, data1 /* Non-zero if differences found. */ - bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ -#ifdef __AARCH64EB__ - rev has_nul, has_nul -#endif - cmp limit, neg_offset, lsr #3 - orr syndrome, diff, has_nul - bic syndrome, syndrome, mask /* Ignore later bytes. */ - csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ - cbnz tmp3, L(syndrome_check) - - /* STEP_C: Compare second part of data1 to first part of tmp1. */ - ldp tmp1, tmp2, [src2], #16 - cmp limit, #8 - LS_BK data2, tmp1, neg_offset - eor diff, data2, data1 /* Non-zero if differences found. */ - orr syndrome, diff, has_nul - and syndrome, syndrome, mask /* Ignore earlier bytes. */ - csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ - cbnz tmp3, L(syndrome_check) - - ldr data1, [src1], #8 - sub limit, limit, #8 - b L(loop_misaligned) - -#ifdef __AARCH64EB__ -L(syndrome_check): - clz pos, syndrome - cmp pos, limit, lsl #3 - b.lo L(end_quick) -#endif - -L(ret0): - mov result, #0 - ret -END(__strncmp_aarch64_mte) - diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S index 234190e..6a9e9f7 100644 --- a/string/aarch64/strncmp-sve.S +++ b/string/aarch64/strncmp-sve.S @@ -1,11 +1,11 @@ /* * strncmp - compare two strings with limit * - * Copyright (c) 2018-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S index 738b653..128a10c 100644 --- a/string/aarch64/strncmp.S +++ b/string/aarch64/strncmp.S @@ -1,20 +1,20 @@ /* * strncmp - compare two strings * - * Copyright (c) 2013-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2013-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64. + * MTE compatible. */ -#include "../asmdefs.h" +#include "asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 /* Parameters and result. */ #define src1 x0 @@ -35,10 +35,24 @@ #define tmp3 x10 #define zeroones x11 #define pos x12 -#define limit_wd x13 -#define mask x14 -#define endloop x15 +#define mask x13 +#define endloop x14 #define count mask +#define offset pos +#define neg_offset x15 + +/* Define endian dependent shift operations. + On big-endian early bytes are at MSB and on little-endian LSB. + LS_FW means shifting towards early bytes. + LS_BK means shifting towards later bytes. 
+ */ +#ifdef __AARCH64EB__ +#define LS_FW lsl +#define LS_BK lsr +#else +#define LS_FW lsr +#define LS_BK lsl +#endif ENTRY (__strncmp_aarch64) PTR_ARG (0) @@ -51,9 +65,6 @@ ENTRY (__strncmp_aarch64) and count, src1, #7 b.ne L(misaligned8) cbnz count, L(mutual_align) - /* Calculate the number of full and partial words -1. */ - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ - lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and @@ -63,56 +74,52 @@ L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 L(start_realigned): - subs limit_wd, limit_wd, #1 + subs limit, limit, #8 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, pl /* Last Dword or differences. */ + csinv endloop, diff, xzr, hi /* Last Dword or differences. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp endloop, #0, #0, eq b.eq L(loop_aligned) /* End of main loop */ - /* Not reached the limit, must have found the end or a diff. */ - tbz limit_wd, #63, L(not_limit) - - /* Limit % 8 == 0 => all bytes significant. */ - ands limit, limit, #7 - b.eq L(not_limit) - - lsl limit, limit, #3 /* Bits -> bytes. */ - mov mask, #~0 -#ifdef __AARCH64EB__ - lsr mask, mask, limit -#else - lsl mask, mask, limit -#endif - bic data1, data1, mask - bic data2, data2, mask - - /* Make sure that the NUL byte is marked in the syndrome. */ - orr has_nul, has_nul, mask - -L(not_limit): +L(full_check): +#ifndef __AARCH64EB__ orr syndrome, diff, has_nul - -#ifndef __AARCH64EB__ + add limit, limit, 8 /* Rewind limit to before last subs. */ +L(syndrome_check): + /* Limit was reached. Check if the NUL byte or the difference + is before the limit. */ rev syndrome, syndrome rev data1, data1 - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. - Shifting left now will bring the critical information into the - top bits. */ clz pos, syndrome rev data2, data2 lsl data1, data1, pos + cmp limit, pos, lsr #3 lsl data2, data2, pos /* But we need to zero-extend (char is unsigned) the value and then perform a signed 32-bit subtraction. */ lsr data1, data1, #56 sub result, data1, data2, lsr #56 + csel result, result, xzr, hi ret #else + /* Not reached the limit, must have found the end or a diff. */ + tbz limit, #63, L(not_limit) + add tmp1, limit, 8 + cbz limit, L(not_limit) + + lsl limit, tmp1, #3 /* Bits -> bytes. */ + mov mask, #~0 + lsr mask, mask, limit + bic data1, data1, mask + bic data2, data2, mask + + /* Make sure that the NUL byte is marked in the syndrome. */ + orr has_nul, has_nul, mask + +L(not_limit): /* For big-endian we cannot use the trick with the syndrome value as carry-propagation can corrupt the upper bits if the trailing bytes in the string contain 0x01. */ @@ -133,10 +140,11 @@ L(not_limit): rev has_nul, has_nul orr syndrome, diff, has_nul clz pos, syndrome - /* The MS-non-zero bit of the syndrome marks either the first bit - that is different, or the top bit of the first zero byte. + /* The most-significant-non-zero bit of the syndrome marks either the + first bit that is different, or the top bit of the first zero byte. Shifting left now will bring the critical information into the top bits. 
*/ +L(end_quick): lsl data1, data1, pos lsl data2, data2, pos /* But we need to zero-extend (char is unsigned) the value and then @@ -158,22 +166,12 @@ L(mutual_align): neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ ldr data2, [src2], #8 mov tmp2, #~0 - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */ -#endif - and tmp3, limit_wd, #7 - lsr limit_wd, limit_wd, #3 - /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ - add limit, limit, count - add tmp3, tmp3, count + LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ + /* Adjust the limit and ensure it doesn't overflow. */ + adds limit, limit, count + csinv limit, limit, xzr, lo orr data1, data1, tmp2 orr data2, data2, tmp2 - add limit_wd, limit_wd, tmp3, lsr #3 b L(start_realigned) .p2align 4 @@ -196,13 +194,11 @@ L(done): /* Align the SRC1 to a dword by doing a bytewise compare and then do the dword loop. */ L(try_misaligned_words): - lsr limit_wd, limit, #3 - cbz count, L(do_misaligned) + cbz count, L(src1_aligned) neg count, count and count, count, #7 sub limit, limit, count - lsr limit_wd, limit, #3 L(page_end_loop): ldrb data1w, [src1], #1 @@ -213,48 +209,100 @@ L(page_end_loop): subs count, count, #1 b.hi L(page_end_loop) -L(do_misaligned): - /* Prepare ourselves for the next page crossing. Unlike the aligned - loop, we fetch 1 less dword because we risk crossing bounds on - SRC2. */ - mov count, #8 - subs limit_wd, limit_wd, #1 - b.lo L(done_loop) -L(loop_misaligned): - and tmp2, src2, #0xff8 - eor tmp2, tmp2, #0xff8 - cbz tmp2, L(page_end_loop) + /* The following diagram explains the comparison of misaligned strings. + The bytes are shown in natural order. For little-endian, it is + reversed in the registers. The "x" bytes are before the string. + The "|" separates data that is loaded at one time. + src1 | a a a a a a a a | b b b c c c c c | . . . + src2 | x x x x x a a a a a a a a b b b | c c c c c . . . + + After shifting in each step, the data looks like this: + STEP_A STEP_B STEP_C + data1 a a a a a a a a b b b c c c c c b b b c c c c c + data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c + The bytes with "0" are eliminated from the syndrome via mask. + + Align SRC2 down to 16 bytes. This way we can read 16 bytes at a + time from SRC2. The comparison happens in 3 steps. After each step + the loop can exit, or read from SRC1 or SRC2. */ +L(src1_aligned): + /* Calculate offset from 8 byte alignment to string start in bits. No + need to mask offset since shifts are ignoring upper bits. */ + lsl offset, src2, #3 + bic src2, src2, #0xf + mov mask, -1 + neg neg_offset, offset ldr data1, [src1], #8 - ldr data2, [src2], #8 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ - ccmp diff, #0, #0, eq - b.ne L(not_limit) - subs limit_wd, limit_wd, #1 - b.pl L(loop_misaligned) + ldp tmp1, tmp2, [src2], #16 + LS_BK mask, mask, neg_offset + and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ + /* Skip the first compare if data in tmp1 is irrelevant. */ + tbnz offset, 6, L(misaligned_mid_loop) -L(done_loop): - /* We found a difference or a NULL before the limit was reached. 
*/ - and limit, limit, #7 - cbz limit, L(not_limit) - /* Read the last word. */ - sub src1, src1, 8 - sub src2, src2, 8 - ldr data1, [src1, limit] - ldr data2, [src2, limit] - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f +L(loop_misaligned): + /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ + LS_FW data2, tmp1, offset + LS_BK tmp1, tmp2, neg_offset + subs limit, limit, #8 + orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ + sub has_nul, data1, zeroones eor diff, data1, data2 /* Non-zero if differences found. */ - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ - ccmp diff, #0, #0, eq - b.ne L(not_limit) + orr tmp3, data1, #REP8_7f + csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ + orr tmp3, endloop, has_nul + cbnz tmp3, L(full_check) + + ldr data1, [src1], #8 +L(misaligned_mid_loop): + /* STEP_B: Compare first part of data1 to second part of tmp2. */ + LS_FW data2, tmp2, offset +#ifdef __AARCH64EB__ + /* For big-endian we do a byte reverse to avoid carry-propagation + problem described above. This way we can reuse the has_nul in the + next step and also use syndrome value trick at the end. */ + rev tmp3, data1 + #define data1_fixed tmp3 +#else + #define data1_fixed data1 +#endif + sub has_nul, data1_fixed, zeroones + orr tmp3, data1_fixed, #REP8_7f + eor diff, data2, data1 /* Non-zero if differences found. */ + bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ +#ifdef __AARCH64EB__ + rev has_nul, has_nul +#endif + cmp limit, neg_offset, lsr #3 + orr syndrome, diff, has_nul + bic syndrome, syndrome, mask /* Ignore later bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ + cbnz tmp3, L(syndrome_check) + + /* STEP_C: Compare second part of data1 to first part of tmp1. */ + ldp tmp1, tmp2, [src2], #16 + cmp limit, #8 + LS_BK data2, tmp1, neg_offset + eor diff, data2, data1 /* Non-zero if differences found. */ + orr syndrome, diff, has_nul + and syndrome, syndrome, mask /* Ignore earlier bytes. */ + csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ + cbnz tmp3, L(syndrome_check) + + ldr data1, [src1], #8 + sub limit, limit, #8 + b L(loop_misaligned) + +#ifdef __AARCH64EB__ +L(syndrome_check): + clz pos, syndrome + cmp pos, limit, lsl #3 + b.lo L(end_quick) +#endif L(ret0): mov result, #0 ret - -END ( __strncmp_aarch64) +END(__strncmp_aarch64) diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S index 5b9ebf7..6c43dc4 100644 --- a/string/aarch64/strnlen-sve.S +++ b/string/aarch64/strnlen-sve.S @@ -1,11 +1,11 @@ /* * strnlen - calculate the length of a string with limit. * - * Copyright (c) 2019-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S index 48d2495..f2090a7 100644 --- a/string/aarch64/strnlen.S +++ b/string/aarch64/strnlen.S @@ -1,8 +1,8 @@ /* * strnlen - calculate the length of a string with limit. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. 
*/ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define cntin x1 @@ -20,39 +20,30 @@ #define src x2 #define synd x3 #define shift x4 -#define wtmp w4 #define tmp x4 #define cntrem x5 #define qdata q0 #define vdata v0 #define vhas_chr v1 -#define vrepmask v2 -#define vend v3 -#define dend d3 +#define vend v2 +#define dend d2 /* Core algorithm: - - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are - set likewise for odd bytes so that adjacent bytes can be merged. Since the - bits in the syndrome reflect the order in which things occur in the original - string, counting trailing zeros identifies exactly which byte matched. */ + Process the string in 16-byte aligned chunks. Compute a 64-bit mask with + four bits per byte using the shrn instruction. A count trailing zeros then + identifies the first zero byte. */ ENTRY (__strnlen_aarch64) PTR_ARG (0) SIZE_ARG (1) bic src, srcin, 15 - mov wtmp, 0xf00f cbz cntin, L(nomatch) - ld1 {vdata.16b}, [src], 16 - dup vrepmask.8h, wtmp + ld1 {vdata.16b}, [src] cmeq vhas_chr.16b, vdata.16b, 0 lsl shift, srcin, 2 - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ fmov synd, dend lsr synd, synd, shift cbz synd, L(start_loop) @@ -64,37 +55,40 @@ L(finish): csel result, cntin, result, ls ret +L(nomatch): + mov result, cntin + ret + L(start_loop): sub tmp, src, srcin + add tmp, tmp, 17 subs cntrem, cntin, tmp - b.ls L(nomatch) + b.lo L(nomatch) /* Make sure that it won't overread by a 16-byte chunk */ - add tmp, cntrem, 15 - tbnz tmp, 4, L(loop32_2) - + tbz cntrem, 4, L(loop32_2) + sub src, src, 16 .p2align 5 L(loop32): - ldr qdata, [src], 16 + ldr qdata, [src, 32]! cmeq vhas_chr.16b, vdata.16b, 0 umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbnz synd, L(end) L(loop32_2): - ldr qdata, [src], 16 + ldr qdata, [src, 16] subs cntrem, cntrem, 32 cmeq vhas_chr.16b, vdata.16b, 0 - b.ls L(end) + b.lo L(end_2) umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ fmov synd, dend cbz synd, L(loop32) - +L(end_2): + add src, src, 16 L(end): - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ - sub src, src, 16 - mov synd, vend.d[0] + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */ sub result, src, srcin + fmov synd, dend #ifndef __AARCH64EB__ rbit synd, synd #endif @@ -104,9 +98,5 @@ L(end): csel result, cntin, result, ls ret -L(nomatch): - mov result, cntin - ret - END (__strnlen_aarch64) diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S index 1e4fb1a..bb61ab9 100644 --- a/string/aarch64/strrchr-mte.S +++ b/string/aarch64/strrchr-mte.S @@ -1,8 +1,8 @@ /* * strrchr - find last position of a character in a string. * - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * MTE compatible. 
*/ -#include "../asmdefs.h" +#include "asmdefs.h" #define srcin x0 #define chrin w1 @@ -19,7 +19,6 @@ #define src x2 #define tmp x3 -#define wtmp w3 #define synd x3 #define shift x4 #define src_match x4 @@ -31,7 +30,6 @@ #define vhas_nul v2 #define vhas_chr v3 #define vrepmask v4 -#define vrepmask2 v5 #define vend v5 #define dend d5 @@ -47,55 +45,67 @@ ENTRY (__strrchr_aarch64_mte) PTR_ARG (0) bic src, srcin, 15 dup vrepchr.16b, chrin - mov wtmp, 0x3003 - dup vrepmask.8h, wtmp - tst srcin, 15 - beq L(loop1) - - ld1 {vdata.16b}, [src], 16 + movi vrepmask.16b, 0x33 + ld1 {vdata.16b}, [src] cmeq vhas_nul.16b, vdata.16b, 0 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - mov wtmp, 0xf00f - dup vrepmask2.8h, wtmp bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b - addp vend.16b, vhas_nul.16b, vhas_nul.16b + shrn vend.8b, vhas_nul.8h, 4 lsl shift, srcin, 2 fmov synd, dend lsr synd, synd, shift lsl synd, synd, shift ands nul_match, synd, 0xcccccccccccccccc bne L(tail) - cbnz synd, L(loop2) + cbnz synd, L(loop2_start) - .p2align 5 + .p2align 4 L(loop1): - ld1 {vdata.16b}, [src], 16 + ldr q1, [src, 16] + cmeq vhas_chr.16b, vdata.16b, vrepchr.16b + cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov synd, dend + cbnz synd, L(loop1_end) + ldr q1, [src, 32]! cmeq vhas_chr.16b, vdata.16b, vrepchr.16b cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b umaxp vend.16b, vhas_nul.16b, vhas_nul.16b fmov synd, dend cbz synd, L(loop1) - + sub src, src, 16 +L(loop1_end): + add src, src, 16 cmeq vhas_nul.16b, vdata.16b, 0 +#ifdef __AARCH64EB__ + bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b + shrn vend.8b, vhas_nul.8h, 4 + fmov synd, dend + rbit synd, synd +#else bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b - bic vhas_nul.8h, 0x0f, lsl 8 - addp vend.16b, vhas_nul.16b, vhas_nul.16b + shrn vend.8b, vhas_nul.8h, 4 fmov synd, dend +#endif ands nul_match, synd, 0xcccccccccccccccc - beq L(loop2) - + beq L(loop2_start) L(tail): sub nul_match, nul_match, 1 and chr_match, synd, 0x3333333333333333 ands chr_match, chr_match, nul_match - sub result, src, 1 + add result, src, 15 clz tmp, chr_match sub result, result, tmp, lsr 2 csel result, result, xzr, ne ret .p2align 4 + nop + nop +L(loop2_start): + add src, src, 16 + bic vrepmask.8h, 0xf0 + L(loop2): cmp synd, 0 csel src_match, src, src_match, ne diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S index d36d69a..825a738 100644 --- a/string/aarch64/strrchr-sve.S +++ b/string/aarch64/strrchr-sve.S @@ -1,11 +1,11 @@ /* * strrchr - find the last of a character in a string * - * Copyright (c) 2019-2021, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ -#include "../asmdefs.h" +#include "asmdefs.h" #if __ARM_FEATURE_SVE /* Assumptions: diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S index 56185ff..bf9cb29 100644 --- a/string/aarch64/strrchr.S +++ b/string/aarch64/strrchr.S @@ -1,8 +1,8 @@ /* * strrchr - find last position of a character in a string. * - * Copyright (c) 2014-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2014-2022, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ /* Assumptions: @@ -11,7 +11,7 @@ * Neon Available. */ -#include "../asmdefs.h" +#include "asmdefs.h" /* Arguments and results. */ #define srcin x0 |
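The updated routines above lean on two zero-byte detection idioms: the scalar `sub`/`orr`/`bic` trick kept in the strncmp code, and the NEON `shrn`-based nibble mask that the rewritten strcpy, strlen-mte and strnlen now use in place of the old `vrepmask`/`addp` sequence. The following is a minimal C sketch of both, for illustration only and not part of this commit; the helper names are invented, and a little-endian AArch64 target with NEON is assumed.

/* Illustrative sketch only; not part of the patched sources. */
#include <arm_neon.h>
#include <stdint.h>

/* Scalar trick from the strncmp code: non-zero iff some byte of x is zero,
   i.e. (x - REP8_01) & ~(x | REP8_7f), matching sub/orr/bic above.  */
static inline uint64_t
has_zero_byte (uint64_t x)
{
  return (x - 0x0101010101010101ULL) & ~(x | 0x7f7f7f7f7f7f7f7fULL);
}

/* NEON nibble-mask trick from the rewritten strcpy/strlen/strnlen: compare
   16 bytes against zero, narrow the 0x00/0xff lanes into a 64-bit syndrome
   with four bits per byte (the "shrn ..., 4" step), then count trailing
   zeros to locate the first NUL.  Little-endian assumed; the big-endian
   assembly inserts an rbit first.  Returns -1 if no NUL is present.  */
static inline int
first_nul_in_16 (const unsigned char *chunk)
{
  uint8x16_t data = vld1q_u8 (chunk);
  uint8x16_t cmp  = vceqq_u8 (data, vdupq_n_u8 (0));       /* 0xff where NUL */
  uint8x8_t  nib  = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
  uint64_t   synd = vget_lane_u64 (vreinterpret_u64_u8 (nib), 0);
  return synd ? (int) (__builtin_ctzll (synd) >> 2) : -1;
}

Because each byte contributes four syndrome bits in string order, the bit index from the count-trailing-zeros (or clz after rbit) divided by four is exactly the byte offset of the first NUL, which is why the assembly follows `clz` with `lsr ..., 2`.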