Diffstat (limited to 'string/aarch64/strrchr-mte.S')
-rw-r--r--  string/aarch64/strrchr-mte.S  127
1 file changed, 127 insertions, 0 deletions
diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S
new file mode 100644
index 0000000..1e4fb1a
--- /dev/null
+++ b/string/aarch64/strrchr-mte.S
@@ -0,0 +1,127 @@
+/*
+ * strrchr - find last position of a character in a string.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
+ */
+
+#include "../asmdefs.h"
+
+#define srcin x0
+#define chrin w1
+#define result x0
+
+#define src x2
+#define tmp x3
+#define wtmp w3
+#define synd x3
+#define shift x4
+#define src_match x4
+#define nul_match x5
+#define chr_match x6
+
+#define vrepchr v0
+#define vdata v1
+#define vhas_nul v2
+#define vhas_chr v3
+#define vrepmask v4
+#define vrepmask2 v5
+#define vend v5
+#define dend d5
+
+/* Core algorithm.
+
+ For each 16-byte chunk we calculate a 64-bit syndrome value, with
+ four bits per byte. The tuple for the first string byte always
+ occupies bits 0-3, on both big- and little-endian systems. Within
+ each tuple, bits 0-1 are set if the relevant byte matched the
+ requested character; bits 2-3 are set if the relevant byte was the
+ terminating NUL. */
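+
+/* Worked example: if a chunk begins "ab\0" and the character is 'a',
+   the low tuples of the syndrome are 0xc03: bits 0-1 flag the 'a' at
+   byte 0 and bits 10-11 flag the NUL at byte 2 (higher tuples depend
+   on the bytes after the NUL). Masking with 0x3333333333333333
+   therefore isolates character matches, and masking with
+   0xcccccccccccccccc isolates NULs. */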
+
+ENTRY (__strrchr_aarch64_mte)
+ PTR_ARG (0)
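+ /* Align the source pointer down to 16 bytes, replicate the target
+    character, and build the mask (0x3003 per halfword) that routes
+    character-match bits into each syndrome tuple. */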
+ bic src, srcin, 15
+ dup vrepchr.16b, chrin
+ mov wtmp, 0x3003
+ dup vrepmask.8h, wtmp
+ tst srcin, 15
+ beq L(loop1)
+
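+ /* srcin is not 16-byte aligned: process the containing aligned
+    chunk and discard the syndrome bits that precede srcin. */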
+ ld1 {vdata.16b}, [src], 16
+ cmeq vhas_nul.16b, vdata.16b, 0
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ mov wtmp, 0xf00f
+ dup vrepmask2.8h, wtmp
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
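+ /* Clear the tuples of the bytes before srcin. Only the low six
+    bits of a variable shift count are used, so the effective shift
+    is 4 * (srcin & 15). */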
+ lsl shift, srcin, 2
+ fmov synd, dend
+ lsr synd, synd, shift
+ lsl synd, synd, shift
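+ /* A NUL in the first chunk means we can finish in the tail; a
+    character match alone is remembered via loop2; otherwise fall
+    into the aligned scanning loop. */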
+ ands nul_match, synd, 0xcccccccccccccccc
+ bne L(tail)
+ cbnz synd, L(loop2)
+
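+ /* Main loop: scan aligned 16-byte chunks until one contains the
+    character or the terminating NUL. */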
+ .p2align 5
+L(loop1):
+ ld1 {vdata.16b}, [src], 16
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
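+ /* vhas_chr is 0xff on a match and every byte is >= 0, so this
+    single unsigned compare flags bytes that are NUL or match the
+    character. */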
+ cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbz synd, L(loop1)
+
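+ /* This chunk contains a match or a NUL: rebuild the full syndrome
+    to find out which. */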
+ cmeq vhas_nul.16b, vdata.16b, 0
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
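+ /* Clear the low nibble of the odd bytes so the pairwise add below
+    cannot carry into the preceding byte's tuple; tuples after the
+    first NUL are don't-cares. */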
+ bic vhas_nul.8h, 0x0f, lsl 8
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ ands nul_match, synd, 0xcccccccccccccccc
+ beq L(loop2)
+
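+ /* The syndrome contains a NUL. Subtracting 1 from nul_match turns
+    the lowest NUL bit into a mask of all bits below it, dropping
+    character matches at or beyond the NUL. CLZ of what survives,
+    shifted right by 2, is the byte distance of the last match from
+    the end of the chunk; if nothing survives, return NULL. */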
+L(tail):
+ sub nul_match, nul_match, 1
+ and chr_match, synd, 0x3333333333333333
+ ands chr_match, chr_match, nul_match
+ sub result, src, 1
+ clz tmp, chr_match
+ sub result, result, tmp, lsr 2
+ csel result, result, xzr, ne
+ ret
+
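+ /* The character has been seen but no NUL yet: remember the most
+    recent matching chunk and its syndrome, then keep scanning for
+    the NUL. */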
+ .p2align 4
+L(loop2):
+ cmp synd, 0
+ csel src_match, src, src_match, ne
+ csel chr_match, synd, chr_match, ne
+ ld1 {vdata.16b}, [src], 16
+ cmeq vhas_nul.16b, vdata.16b, 0
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ tst synd, 0xcccccccccccccccc
+ beq L(loop2)
+
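+ /* Found the NUL: rebuild the full syndrome for the final chunk.
+    Character matches before the NUL supersede the remembered match;
+    otherwise use the saved chunk and syndrome. */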
+ bic vhas_nul.8h, 0x0f, lsl 8
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ and nul_match, synd, 0xcccccccccccccccc
+ sub nul_match, nul_match, 1
+ and tmp, synd, 0x3333333333333333
+ ands tmp, tmp, nul_match
+ csel chr_match, tmp, chr_match, ne
+ csel src_match, src, src_match, ne
+ sub src_match, src_match, 1
+ clz tmp, chr_match
+ sub result, src_match, tmp, lsr 2
+ ret
+
+END (__strrchr_aarch64_mte)
+