From 0c9a5f3ef64fc801eb6d28971659867284c3000b Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra
Date: Wed, 20 May 2020 17:00:43 +0100
Subject: string: Add optimized strcpy-mte and stpcpy-mte

Add optimized MTE-compatible strcpy-mte and stpcpy-mte. On various
microarchitectures the speedup over the non-MTE version is 53% on large
strings and 20-60% on small strings.
---
 string/aarch64/stpcpy-mte.S |  10 +++
 string/aarch64/strcpy-mte.S | 159 ++++++++++++++++++++++++++++++++++++++++++++
 string/include/stringlib.h  |   2 +
 string/test/stpcpy.c        |   1 +
 string/test/strcpy.c        |   1 +
 5 files changed, 173 insertions(+)
 create mode 100644 string/aarch64/stpcpy-mte.S
 create mode 100644 string/aarch64/strcpy-mte.S

diff --git a/string/aarch64/stpcpy-mte.S b/string/aarch64/stpcpy-mte.S
new file mode 100644
index 0000000..f1c7119
--- /dev/null
+++ b/string/aarch64/stpcpy-mte.S
@@ -0,0 +1,10 @@
+/*
+ * stpcpy - copy a string returning pointer to end.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define BUILD_STPCPY 1
+
+#include "strcpy-mte.S"
diff --git a/string/aarch64/strcpy-mte.S b/string/aarch64/strcpy-mte.S
new file mode 100644
index 0000000..7c8629e
--- /dev/null
+++ b/string/aarch64/strcpy-mte.S
@@ -0,0 +1,159 @@
+/*
+ * strcpy/stpcpy - copy a string returning pointer to start/end.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define dstin		x0
+#define srcin		x1
+#define result		x0
+
+#define src		x2
+#define dst		x3
+#define len		x4
+#define synd		x4
+#define tmp		x5
+#define wtmp		w5
+#define shift		x5
+#define data1		x6
+#define dataw1		w6
+#define data2		x7
+#define dataw2		w7
+
+#define dataq		q0
+#define vdata		v0
+#define vhas_nul	v1
+#define vrepmask	v2
+#define vend		v3
+#define dend		d3
+#define dataq2		q1
+
+#ifdef BUILD_STPCPY
+# define STRCPY __stpcpy_aarch64_mte
+# define IFSTPCPY(X,...) X,__VA_ARGS__
+#else
+# define STRCPY __strcpy_aarch64_mte
+# define IFSTPCPY(X,...)
+#endif
+
+/* Core algorithm:
+
+   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+   set likewise for odd bytes so that adjacent bytes can be merged. Since the
+   bits in the syndrome reflect the order in which things occur in the original
+   string, counting trailing zeros identifies exactly which byte matched.  */
+
+ENTRY (STRCPY)
+	bic	src, srcin, 15
+	mov	wtmp, 0xf00f
+	ld1	{vdata.16b}, [src]
+	dup	vrepmask.8h, wtmp
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	lsl	shift, srcin, 2
+	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	synd, dend
+	lsr	synd, synd, shift
+	cbnz	synd, L(tail)
+
+	ldr	dataq, [src, 16]!
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	synd, dend
+	cbz	synd, L(start_loop)
+
+#ifndef __AARCH64EB__
+	rbit	synd, synd
+#endif
+	sub	tmp, src, srcin
+	clz	len, synd
+	add	len, tmp, len, lsr 2
+	tbz	len, 4, L(less16)
+	sub	tmp, len, 15
+	ldr	dataq, [srcin]
+	ldr	dataq2, [srcin, tmp]
+	str	dataq, [dstin]
+	str	dataq2, [dstin, tmp]
+	IFSTPCPY (add result, dstin, len)
+	ret
+
+	.p2align 4,,8
+L(tail):
+	rbit	synd, synd
+	clz	len, synd
+	lsr	len, len, 2
+
+	.p2align 4
+L(less16):
+	tbz	len, 3, L(less8)
+	sub	tmp, len, 7
+	ldr	data1, [srcin]
+	ldr	data2, [srcin, tmp]
+	str	data1, [dstin]
+	str	data2, [dstin, tmp]
+	IFSTPCPY (add result, dstin, len)
+	ret
+
+	.p2align 4
+L(less8):
+	subs	tmp, len, 3
+	b.lo	L(less4)
+	ldr	dataw1, [srcin]
+	ldr	dataw2, [srcin, tmp]
+	str	dataw1, [dstin]
+	str	dataw2, [dstin, tmp]
+	IFSTPCPY (add result, dstin, len)
+	ret
+
+L(less4):
+	cbz	len, L(zerobyte)
+	ldrh	dataw1, [srcin]
+	strh	dataw1, [dstin]
+L(zerobyte):
+	strb	wzr, [dstin, len]
+	IFSTPCPY (add result, dstin, len)
+	ret
+
+	.p2align 4
+L(start_loop):
+	sub	len, src, srcin
+	ldr	dataq2, [srcin]
+	add	dst, dstin, len
+	str	dataq2, [dstin]
+
+	.p2align 5
+L(loop):
+	str	dataq, [dst], 16
+	ldr	dataq, [src, 16]!
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	synd, dend
+	cbz	synd, L(loop)
+
+	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	fmov	synd, dend
+#ifndef __AARCH64EB__
+	rbit	synd, synd
+#endif
+	clz	len, synd
+	lsr	len, len, 2
+	sub	tmp, len, 15
+	ldr	dataq, [src, tmp]
+	str	dataq, [dst, tmp]
+	IFSTPCPY (add result, dst, len)
+	ret
+
+END (STRCPY)
diff --git a/string/include/stringlib.h b/string/include/stringlib.h
index 0e18237..841a7bb 100644
--- a/string/include/stringlib.h
+++ b/string/include/stringlib.h
@@ -29,6 +29,8 @@ size_t __strlen_aarch64 (const char *);
 size_t __strnlen_aarch64 (const char *, size_t);
 int __strncmp_aarch64 (const char *, const char *, size_t);
 void * __memchr_aarch64_mte (const void *, int, size_t);
+char *__strcpy_aarch64_mte (char *__restrict, const char *__restrict);
+char *__stpcpy_aarch64_mte (char *__restrict, const char *__restrict);
 char *__strchr_aarch64_mte (const char *, int);
 char * __strchrnul_aarch64_mte (const char *, int );
 size_t __strlen_aarch64_mte (const char *);
diff --git a/string/test/stpcpy.c b/string/test/stpcpy.c
index de17df9..de51ad8 100644
--- a/string/test/stpcpy.c
+++ b/string/test/stpcpy.c
@@ -24,6 +24,7 @@ static const struct fun
   F(stpcpy)
 #if __aarch64__
   F(__stpcpy_aarch64)
+  F(__stpcpy_aarch64_mte)
 # if __ARM_FEATURE_SVE
   F(__stpcpy_aarch64_sve)
 # endif
diff --git a/string/test/strcpy.c b/string/test/strcpy.c
index 753203f..b751661 100644
--- a/string/test/strcpy.c
+++ b/string/test/strcpy.c
@@ -23,6 +23,7 @@ static const struct fun
   F(strcpy)
 #if __aarch64__
   F(__strcpy_aarch64)
+  F(__strcpy_aarch64_mte)
 # if __ARM_FEATURE_SVE
   F(__strcpy_aarch64_sve)
 # endif
--
cgit v1.2.3
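
For reference, the 4-bits-per-byte syndrome described in the core-algorithm
comment maps fairly directly onto NEON intrinsics. The sketch below is a
hedged illustration rather than code from this patch: first_nul_index is a
hypothetical helper name, and it assumes a little-endian AArch64 target where
all 16 bytes of the chunk are readable (the assembly guarantees this by
aligning src down to a 16-byte boundary before the first load, which is also
what keeps the access MTE-safe).

#include <arm_neon.h>
#include <stdint.h>

/* Hypothetical helper: index of the first NUL byte in a 16-byte chunk,
   or 16 if the chunk contains no NUL.  Mirrors the cmeq/and/addp/fmov
   sequence in the assembly above.  */
static inline unsigned
first_nul_index (const uint8_t *chunk16)
{
  uint8x16_t data = vld1q_u8 (chunk16);
  /* 0xff in every byte that is NUL, 0x00 elsewhere (cmeq vhas_nul, vdata, 0).  */
  uint8x16_t eq_nul = vceqq_u8 (data, vdupq_n_u8 (0));
  /* 0xf00f per halfword keeps bits 0-3 of even bytes and bits 4-7 of odd
     bytes (dup vrepmask.8h, 0xf00f; and vhas_nul, vhas_nul, vrepmask).  */
  uint8x16_t mask = vreinterpretq_u8_u16 (vdupq_n_u16 (0xf00f));
  uint8x16_t bits = vandq_u8 (eq_nul, mask);
  /* Pairwise add merges adjacent bytes, narrowing 128 bits to 64 (addp).  */
  uint8x8_t merged = vpadd_u8 (vget_low_u8 (bits), vget_high_u8 (bits));
  uint64_t synd = vget_lane_u64 (vreinterpret_u64_u8 (merged), 0);
  if (synd == 0)
    return 16;		/* no NUL in this chunk */
  /* Four syndrome bits per byte, so ctz/4 is the byte index; the assembly's
     rbit + clz computes the same trailing-zero count.  */
  return __builtin_ctzll (synd) / 4;
}

The entry sequence additionally shifts the syndrome right by (srcin % 16) * 4
bits (lsl shift, srcin, 2 then lsr synd, synd, shift, with the shift amount
taken modulo 64), discarding the nibbles for bytes before the unaligned string
start, so the aligned first load still reports the correct NUL position.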
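
The L(less16)/L(less8)/L(less4) tails are also worth a gloss: instead of a
byte-by-byte loop, each copies the string plus its NUL terminator with two
fixed-size loads and stores whose ranges may overlap. Below is a rough C
equivalent of the 8-byte case, again a hedged sketch with a hypothetical
name; len is assumed to be the offset of the NUL, between 8 and 15.

#include <string.h>
#include <stdint.h>

/* Copy a string whose NUL is at offset len (8 <= len <= 15) using two
   possibly-overlapping 8-byte copies, mirroring the data1/data2 ldr/str
   pair in the fall-through of L(less16).  memcpy stands in for the
   unaligned 64-bit loads and stores.  */
static void
copy_tail_8_15 (char *dst, const char *src, size_t len)
{
  uint64_t head, tail;
  memcpy (&head, src, 8);            /* bytes 0..7 */
  memcpy (&tail, src + len - 7, 8);  /* bytes len-7..len, includes the NUL */
  memcpy (dst, &head, 8);
  memcpy (dst + len - 7, &tail, 8);
}

The 16-byte and 4-byte branches follow the same pattern with q-register and
w-register copies, and the ldrh/strh plus the trailing strb wzr, [dstin, len]
in L(less4)/L(zerobyte) cover lengths 0-2.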