Diffstat (limited to 'string/aarch64/strcpy-mte.S')
-rw-r--r-- | string/aarch64/strcpy-mte.S | 161
1 file changed, 0 insertions, 161 deletions
diff --git a/string/aarch64/strcpy-mte.S b/string/aarch64/strcpy-mte.S
deleted file mode 100644
index 88c222d..0000000
--- a/string/aarch64/strcpy-mte.S
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * strcpy/stpcpy - copy a string returning pointer to start/end.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, Advanced SIMD.
- * MTE compatible.
- */
-
-#include "../asmdefs.h"
-
-#define dstin		x0
-#define srcin		x1
-#define result		x0
-
-#define src		x2
-#define dst		x3
-#define len		x4
-#define synd		x4
-#define tmp		x5
-#define wtmp		w5
-#define shift		x5
-#define data1		x6
-#define dataw1		w6
-#define data2		x7
-#define dataw2		w7
-
-#define dataq		q0
-#define vdata		v0
-#define vhas_nul	v1
-#define vrepmask	v2
-#define vend		v3
-#define dend		d3
-#define dataq2		q1
-
-#ifdef BUILD_STPCPY
-# define STRCPY __stpcpy_aarch64_mte
-# define IFSTPCPY(X,...) X,__VA_ARGS__
-#else
-# define STRCPY __strcpy_aarch64_mte
-# define IFSTPCPY(X,...)
-#endif
-
-/* Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
-
-ENTRY (STRCPY)
-	PTR_ARG (0)
-	PTR_ARG (1)
-	bic	src, srcin, 15
-	mov	wtmp, 0xf00f
-	ld1	{vdata.16b}, [src]
-	dup	vrepmask.8h, wtmp
-	cmeq	vhas_nul.16b, vdata.16b, 0
-	lsl	shift, srcin, 2
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
-	fmov	synd, dend
-	lsr	synd, synd, shift
-	cbnz	synd, L(tail)
-
-	ldr	dataq, [src, 16]!
-	cmeq	vhas_nul.16b, vdata.16b, 0
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
-	fmov	synd, dend
-	cbz	synd, L(start_loop)
-
-#ifndef __AARCH64EB__
-	rbit	synd, synd
-#endif
-	sub	tmp, src, srcin
-	clz	len, synd
-	add	len, tmp, len, lsr 2
-	tbz	len, 4, L(less16)
-	sub	tmp, len, 15
-	ldr	dataq, [srcin]
-	ldr	dataq2, [srcin, tmp]
-	str	dataq, [dstin]
-	str	dataq2, [dstin, tmp]
-	IFSTPCPY (add result, dstin, len)
-	ret
-
-	.p2align 4,,8
-L(tail):
-	rbit	synd, synd
-	clz	len, synd
-	lsr	len, len, 2
-
-	.p2align 4
-L(less16):
-	tbz	len, 3, L(less8)
-	sub	tmp, len, 7
-	ldr	data1, [srcin]
-	ldr	data2, [srcin, tmp]
-	str	data1, [dstin]
-	str	data2, [dstin, tmp]
-	IFSTPCPY (add result, dstin, len)
-	ret
-
-	.p2align 4
-L(less8):
-	subs	tmp, len, 3
-	b.lo	L(less4)
-	ldr	dataw1, [srcin]
-	ldr	dataw2, [srcin, tmp]
-	str	dataw1, [dstin]
-	str	dataw2, [dstin, tmp]
-	IFSTPCPY (add result, dstin, len)
-	ret
-
-L(less4):
-	cbz	len, L(zerobyte)
-	ldrh	dataw1, [srcin]
-	strh	dataw1, [dstin]
-L(zerobyte):
-	strb	wzr, [dstin, len]
-	IFSTPCPY (add result, dstin, len)
-	ret
-
-	.p2align 4
-L(start_loop):
-	sub	len, src, srcin
-	ldr	dataq2, [srcin]
-	add	dst, dstin, len
-	str	dataq2, [dstin]
-
-	.p2align 5
-L(loop):
-	str	dataq, [dst], 16
-	ldr	dataq, [src, 16]!
-	cmeq	vhas_nul.16b, vdata.16b, 0
-	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
-	fmov	synd, dend
-	cbz	synd, L(loop)
-
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b	/* 128->64 */
-	fmov	synd, dend
-#ifndef __AARCH64EB__
-	rbit	synd, synd
-#endif
-	clz	len, synd
-	lsr	len, len, 2
-	sub	tmp, len, 15
-	ldr	dataq, [src, tmp]
-	str	dataq, [dst, tmp]
-	IFSTPCPY (add result, dst, len)
-	ret
-
-END (STRCPY)
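The "Core algorithm" comment in the deleted file is the heart of the routine: cmeq marks each NUL byte as 0xff, the repeated 0xf00f mask plus addp fold the 16 comparison bytes into a 64-bit syndrome with one nibble per input byte in string order, and rbit+clz (a count of trailing zeros) divided by four recovers the index of the first NUL. The scalar C sketch below models that reduction for illustration only; the helper name nul_syndrome is hypothetical (not from the source), and GCC/Clang's __builtin_ctzll stands in for the rbit+clz pair.

#include <stdint.h>
#include <stdio.h>

/* Scalar model of the cmeq/and/addp reduction: each of the 16 input
   bytes contributes one nibble to the syndrome, in string order.
   (Hypothetical helper, not part of the deleted file.) */
static uint64_t nul_syndrome(const unsigned char chunk[16])
{
    uint64_t synd = 0;
    for (int i = 0; i < 16; i++)
        if (chunk[i] == 0)
            synd |= 0xfULL << (4 * i);
    return synd;
}

int main(void)
{
    unsigned char chunk[16] = "hello";   /* first NUL is at index 5 */
    uint64_t synd = nul_syndrome(chunk);
    if (synd)   /* trailing zeros / 4 == index of first NUL byte */
        printf("first NUL at byte %d\n", (int)(__builtin_ctzll(synd) / 4));
    return 0;
}

The same model explains the first-iteration "lsr synd, synd, shift" in the deleted code: with shift set to 4 bits per byte of misalignment (srcin scaled by 4), the right shift discards syndrome nibbles for bytes that precede the true start of the string in the aligned 16-byte load.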