author     Wilco Dijkstra <wilco.dijkstra@arm.com>  2020-05-20 17:00:43 +0100
committer  Szabolcs Nagy <szabolcs.nagy@arm.com>    2020-05-20 18:18:32 +0100
commit     0c9a5f3ef64fc801eb6d28971659867284c3000b (patch)
tree       f4e57c2378ee5fce601ef0d164d8c62fff4885b7
parent     a99a1a9615b953b59e98fa22d780087a34a7e22b (diff)
download   arm-optimized-routines-0c9a5f3ef64fc801eb6d28971659867284c3000b.tar.gz
string: Add optimized strcpy-mte and stpcpy-mte
Add optimized MTE-compatible strcpy-mte and stpcpy-mte. Across various microarchitectures, the speedup over the non-MTE versions is 53% on large strings and 20-60% on small strings.
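MTE compatibility here means the routine never loads across a 16-byte tag granule it does not own: the first load is aligned down to 16 bytes and all later loads advance in whole granules. A minimal C sketch of that alignment step (illustrative only; 'granule_base' is not a name from this commit):

#include <stdint.h>

/* With MTE, memory tags cover 16-byte granules: a 16-byte load from a
   16-byte-aligned address stays inside one granule, so it can never touch
   a granule outside the string's allocation.  This mirrors the routine's
   first instruction, 'bic src, srcin, 15'.  */
static inline const void *
granule_base (const void *p)
{
  return (const void *) ((uintptr_t) p & ~(uintptr_t) 15);
}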
-rw-r--r--  string/aarch64/stpcpy-mte.S  |  10
-rw-r--r--  string/aarch64/strcpy-mte.S  | 159
-rw-r--r--  string/include/stringlib.h   |   2
-rw-r--r--  string/test/stpcpy.c         |   1
-rw-r--r--  string/test/strcpy.c         |   1
5 files changed, 173 insertions, 0 deletions
diff --git a/string/aarch64/stpcpy-mte.S b/string/aarch64/stpcpy-mte.S
new file mode 100644
index 0000000..f1c7119
--- /dev/null
+++ b/string/aarch64/stpcpy-mte.S
@@ -0,0 +1,10 @@
+/*
+ * stpcpy - copy a string returning pointer to end.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define BUILD_STPCPY 1
+
+#include "strcpy-mte.S"
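The wrapper above builds stpcpy from the strcpy source: defining BUILD_STPCPY makes the shared body expand IFSTPCPY(...) so the return value becomes a pointer to the terminating NUL instead of the start. The same single-source trick in C, as a hedged sketch (RETURN_PTR and copy_string are illustrative names, not from the repo):

#include <stddef.h>

#ifdef BUILD_STPCPY
# define RETURN_PTR(dst, len) ((dst) + (len))  /* stpcpy: pointer to the NUL */
#else
# define RETURN_PTR(dst, len) (dst)            /* strcpy: pointer to the start */
#endif

char *
copy_string (char *restrict dst, const char *restrict src)
{
  size_t len = 0;
  while ((dst[len] = src[len]) != '\0')  /* copy up to and including the NUL */
    len++;
  return RETURN_PTR (dst, len);
}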
diff --git a/string/aarch64/strcpy-mte.S b/string/aarch64/strcpy-mte.S
new file mode 100644
index 0000000..7c8629e
--- /dev/null
+++ b/string/aarch64/strcpy-mte.S
@@ -0,0 +1,159 @@
+/*
+ * strcpy/stpcpy - copy a string returning pointer to start/end.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define dstin x0
+#define srcin x1
+#define result x0
+
+#define src x2
+#define dst x3
+#define len x4
+#define synd x4
+#define tmp x5
+#define wtmp w5
+#define shift x5
+#define data1 x6
+#define dataw1 w6
+#define data2 x7
+#define dataw2 w7
+
+#define dataq q0
+#define vdata v0
+#define vhas_nul v1
+#define vrepmask v2
+#define vend v3
+#define dend d3
+#define dataq2 q1
+
+#ifdef BUILD_STPCPY
+# define STRCPY __stpcpy_aarch64_mte
+# define IFSTPCPY(X,...) X,__VA_ARGS__
+#else
+# define STRCPY __strcpy_aarch64_mte
+# define IFSTPCPY(X,...)
+#endif
+
+/* Core algorithm:
+
+   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+   per byte. For even bytes, bits 0-3 are set if the relevant byte is NUL and
+   bits 4-7 must be zero; for odd bytes the two nibbles swap roles, so that
+   the syndromes of adjacent bytes can be merged into one. Since the bits in
+   the syndrome reflect the order in which bytes occur in the original
+   string, counting trailing zeros identifies exactly the first NUL byte. */
+
+ENTRY (STRCPY)
+ bic src, srcin, 15 /* Align src down to the 16-byte MTE granule.  */
+ mov wtmp, 0xf00f /* Per-halfword mask: low nibble even, high nibble odd.  */
+ ld1 {vdata.16b}, [src]
+ dup vrepmask.8h, wtmp
+ cmeq vhas_nul.16b, vdata.16b, 0
+ lsl shift, srcin, 2 /* Discard 4 syndrome bits per byte before srcin.  */
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64.  */
+ fmov synd, dend
+ lsr synd, synd, shift
+ cbnz synd, L(tail)
+
+ ldr dataq, [src, 16]! /* Check the next, now aligned, 16 bytes.  */
+ cmeq vhas_nul.16b, vdata.16b, 0
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbz synd, L(start_loop)
+
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ sub tmp, src, srcin
+ clz len, synd
+ add len, tmp, len, lsr 2 /* len = number of bytes before the NUL.  */
+ tbz len, 4, L(less16)
+ sub tmp, len, 15
+ ldr dataq, [srcin] /* 16-31 bytes: two overlapping 16-byte copies.  */
+ ldr dataq2, [srcin, tmp] /* Last 16 bytes, ending at the NUL.  */
+ str dataq, [dstin]
+ str dataq2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
+ ret
+
+ .p2align 4,,8
+L(tail):
+ rbit synd, synd
+ clz len, synd
+ lsr len, len, 2 /* len = index of the NUL byte.  */
+
+ .p2align 4
+L(less16):
+ tbz len, 3, L(less8)
+ sub tmp, len, 7
+ ldr data1, [srcin] /* 8-15 bytes: two overlapping 8-byte copies.  */
+ ldr data2, [srcin, tmp]
+ str data1, [dstin]
+ str data2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
+ ret
+
+ .p2align 4
+L(less8):
+ subs tmp, len, 3
+ b.lo L(less4)
+ ldr dataw1, [srcin] /* 3-7 bytes: two overlapping 4-byte copies.  */
+ ldr dataw2, [srcin, tmp]
+ str dataw1, [dstin]
+ str dataw2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
+ ret
+
+L(less4):
+ cbz len, L(zerobyte)
+ ldrh dataw1, [srcin] /* 1-2 bytes: a single 2-byte copy.  */
+ strh dataw1, [dstin]
+L(zerobyte):
+ strb wzr, [dstin, len] /* Write the terminating NUL.  */
+ IFSTPCPY (add result, dstin, len)
+ ret
+
+ .p2align 4
+L(start_loop):
+ sub len, src, srcin /* Offset of the first aligned chunk.  */
+ ldr dataq2, [srcin]
+ add dst, dstin, len /* Keep dst in step with the aligned src.  */
+ str dataq2, [dstin] /* Copy the first, unaligned 16 bytes.  */
+
+ .p2align 5
+L(loop):
+ str dataq, [dst], 16 /* Store the previously loaded chunk.  */
+ ldr dataq, [src, 16]! /* Load the next 16-byte chunk.  */
+ cmeq vhas_nul.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b /* Any NUL in the chunk?  */
+ fmov synd, dend
+ cbz synd, L(loop)
+
+ and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ fmov synd, dend
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ clz len, synd
+ lsr len, len, 2 /* Index of the NUL in the final chunk.  */
+ sub tmp, len, 15
+ ldr dataq, [src, tmp] /* Final 16 bytes, ending exactly at the NUL.  */
+ str dataq, [dst, tmp]
+ IFSTPCPY (add result, dst, len)
+ ret
+
+END (STRCPY)
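To cross-check the assembly above against something runnable, here is a rough C model (an illustrative sketch, not part of the commit; all helper names are made up). The first function mirrors the entry sequence's syndrome computation: compare each byte of the aligned granule with NUL, pack four syndrome bits per byte, and shift off the bytes that precede srcin. The second mirrors the short-string tail, L(less16) through L(zerobyte): any string shorter than 16 bytes is copied with at most two overlapping fixed-size accesses instead of a byte loop. Note that reading a whole granule is valid in the assembly under MTE but would be out of bounds in portable C for short strings, so this model is for reasoning, not for use.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Model of the entry sequence: a 4-bit-per-byte syndrome for the granule
   containing s, with the bytes before s shifted out (cf. lsr synd, shift).  */
static uint64_t
first_chunk_syndrome (const char *s)
{
  const unsigned char *granule =
    (const unsigned char *) ((uintptr_t) s & ~(uintptr_t) 15); /* bic */
  uint64_t synd = 0;
  for (int i = 0; i < 16; i++)   /* cmeq + and + addp, merged into one loop */
    if (granule[i] == 0)
      synd |= (uint64_t) 0xf << (i * 4);
  return synd >> (((uintptr_t) s & 15) * 4);
  /* If nonzero, the NUL index is count-trailing-zeros (synd) / 4.  */
}

/* Model of the short tail: NUL at src[len], len < 16; cover [0, len] with
   at most two overlapping fixed-size copies, never a byte loop.  */
static void
copy_short (char *dst, const char *src, size_t len)
{
  if (len & 8)             /* 8-15 bytes: two 8-byte copies (L(less16)) */
    {
      memcpy (dst, src, 8);
      memcpy (dst + len - 7, src + len - 7, 8);  /* ends at the NUL */
    }
  else if (len >= 3)       /* 3-7 bytes: two 4-byte copies (L(less8)) */
    {
      memcpy (dst, src, 4);
      memcpy (dst + len - 3, src + len - 3, 4);
    }
  else                     /* 0-2 bytes: one 2-byte copy + NUL (L(less4)) */
    {
      if (len != 0)
        memcpy (dst, src, 2);
      dst[len] = '\0';
    }
}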
diff --git a/string/include/stringlib.h b/string/include/stringlib.h
index 0e18237..841a7bb 100644
--- a/string/include/stringlib.h
+++ b/string/include/stringlib.h
@@ -29,6 +29,8 @@ size_t __strlen_aarch64 (const char *);
size_t __strnlen_aarch64 (const char *, size_t);
int __strncmp_aarch64 (const char *, const char *, size_t);
void * __memchr_aarch64_mte (const void *, int, size_t);
+char *__strcpy_aarch64_mte (char *__restrict, const char *__restrict);
+char *__stpcpy_aarch64_mte (char *__restrict, const char *__restrict);
char *__strchr_aarch64_mte (const char *, int);
char * __strchrnul_aarch64_mte (const char *, int );
size_t __strlen_aarch64_mte (const char *);
diff --git a/string/test/stpcpy.c b/string/test/stpcpy.c
index de17df9..de51ad8 100644
--- a/string/test/stpcpy.c
+++ b/string/test/stpcpy.c
@@ -24,6 +24,7 @@ static const struct fun
F(stpcpy)
#if __aarch64__
F(__stpcpy_aarch64)
+ F(__stpcpy_aarch64_mte)
# if __ARM_FEATURE_SVE
F(__stpcpy_aarch64_sve)
# endif
diff --git a/string/test/strcpy.c b/string/test/strcpy.c
index 753203f..b751661 100644
--- a/string/test/strcpy.c
+++ b/string/test/strcpy.c
@@ -23,6 +23,7 @@ static const struct fun
F(strcpy)
#if __aarch64__
F(__strcpy_aarch64)
+ F(__strcpy_aarch64_mte)
# if __ARM_FEATURE_SVE
F(__strcpy_aarch64_sve)
# endif
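The two test files register the new variants in their function tables via the F() macro, and the harness then exercises every registered implementation against the same checks. A hedged sketch of that registration pattern (this is not the repository's actual harness, just the general shape; the body of main is illustrative):

#include <stdio.h>
#include <string.h>

static const struct fun
{
  const char *name;
  char *(*fun) (char *dest, const char *src);
} funtab[] = {
#define F(x) { #x, x },
  F (strcpy)
#undef F
  { 0, 0 }
};

int
main (void)
{
  char buf[32];
  for (const struct fun *f = funtab; f->name; f++)
    {
      f->fun (buf, "hello");
      printf ("%s: %s\n", f->name, strcmp (buf, "hello") ? "FAIL" : "PASS");
    }
  return 0;
}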