author     Danila Kutenin <danilak@google.com>    2022-06-22 12:37:06 +0100
committer  Szabolcs Nagy <szabolcs.nagy@arm.com>  2022-07-06 08:57:17 +0100
commit     2be383412b3c5470d8d45f49d73f4a7c503022a2 (patch)
tree       4c2fe2b02333eb326ab5ef50132e8ededecf685a /string
parent     94ed5b928a498bccb72d582156bf0f39c104a481 (diff)
string: Optimize string functions with shrn instruction
Optimize __memchr_aarch64_mte, __memrchr_aarch64, __strchrnul_aarch64_mte, __stpcpy_aarch64, __strcpy_aarch64 and __strlen_aarch64_mte using the shrn instruction to compute the nibble mask instead of and + addp, which reduces the instruction count.
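
For context, the change replaces the and + addp nibble-mask computation with a single shrn. Roughly, using the register names from memchr-mte.S (a sketch assembled from the hunks below, not a literal excerpt of either version):

	/* Before: mask each comparison byte down to 4 bits with the 0xf00f
	   pattern, then pairwise-add to narrow the 128-bit result to a
	   64-bit syndrome.  */
	mov	wtmp, 0xf00f
	dup	vrepmask.8h, wtmp
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
	fmov	synd, dend

	/* After: shift each 16-bit lane right by 4 and narrow to 8 bits,
	   keeping 4 bits of every comparison byte in one instruction.  */
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	shrn	vend.8b, vhas_chr.8h, 4			/* 128->64 */
	fmov	synd, dend

This also frees the general-purpose temporary and the vrepmask/vend registers that the old sequence needed, which is where the register renumbering in the hunks below comes from.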
Diffstat (limited to 'string')
-rw-r--r--  string/aarch64/memchr-mte.S     26
-rw-r--r--  string/aarch64/memrchr.S        26
-rw-r--r--  string/aarch64/strchrnul-mte.S  29
-rw-r--r--  string/aarch64/strcpy.S         32
-rw-r--r--  string/aarch64/strlen-mte.S     14
5 files changed, 45 insertions, 82 deletions
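
The core-algorithm comment updated in each hunk below describes how the resulting nibble mask is consumed. As a worked example (little-endian, and assuming the usual rbit/clz tail these routines run after the fmov shown in the hunks; only the rbit is visible in this diff): if the first match is at byte 3 of a chunk, cmeq leaves 0xff in byte 3, shrn leaves 0xf in nibble 3 of the 64-bit syndrome, and the byte index is the bit index divided by four:

	fmov	synd, dend	/* synd = 0x000000000000f000 */
	rbit	synd, synd	/* reverse bits: first match becomes the leading set bit */
	clz	tmp, synd	/* tmp = 12, bit position of the match */
	lsr	tmp, tmp, 2	/* tmp = 3, four syndrome bits per byte */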
diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S
index 8441585..0f434cf 100644
--- a/string/aarch64/memchr-mte.S
+++ b/string/aarch64/memchr-mte.S
@@ -23,25 +23,21 @@
#define synd x5
#define shift x6
#define tmp x7
-#define wtmp w7
#define vrepchr v0
#define qdata q1
#define vdata v1
#define vhas_chr v2
-#define vrepmask v3
-#define vend v4
-#define dend d4
+#define vend v3
+#define dend d3
/*
Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+ For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+ per byte. We take 4 bits of every comparison byte with shift right and narrow
+ by 4 instruction. Since the bits in the nibble mask reflect the order in
+ which things occur in the original string, counting leading zeros identifies
+ exactly which byte matched. */
ENTRY (__memchr_aarch64_mte)
PTR_ARG (0)
@@ -50,12 +46,9 @@ ENTRY (__memchr_aarch64_mte)
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src]
dup vrepchr.16b, chrin
- mov wtmp, 0xf00f
- dup vrepmask.8h, wtmp
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
lsl shift, srcin, 2
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
cbz synd, L(start_loop)
@@ -94,8 +87,7 @@ L(loop32_2):
fmov synd, dend
cbz synd, L(loop32)
L(end):
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov synd, dend
add tmp, srcin, cntin
sub cntrem, tmp, src
diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S
index ff4f47a..47fbce2 100644
--- a/string/aarch64/memrchr.S
+++ b/string/aarch64/memrchr.S
@@ -23,7 +23,6 @@
#define synd x5
#define shift x6
#define tmp x7
-#define wtmp w7
#define end x8
#define endm1 x9
@@ -31,19 +30,16 @@
#define qdata q1
#define vdata v1
#define vhas_chr v2
-#define vrepmask v3
-#define vend v4
-#define dend d4
+#define vend v3
+#define dend d3
/*
Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+ For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+ per byte. We take 4 bits of every comparison byte with shift right and narrow
+ by 4 instruction. Since the bits in the nibble mask reflect the order in
+ which things occur in the original string, counting leading zeros identifies
+ exactly which byte matched. */
ENTRY (__memrchr_aarch64)
PTR_ARG (0)
@@ -53,12 +49,9 @@ ENTRY (__memrchr_aarch64)
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src]
dup vrepchr.16b, chrin
- mov wtmp, 0xf00f
- dup vrepmask.8h, wtmp
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
neg shift, end, lsl 2
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov synd, dend
lsl synd, synd, shift
cbz synd, L(start_loop)
@@ -95,8 +88,7 @@ L(loop32_2):
fmov synd, dend
cbz synd, L(loop32)
L(end):
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov synd, dend
add tmp, src, 15
diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S
index 4a08b52..9be5cbc 100644
--- a/string/aarch64/strchrnul-mte.S
+++ b/string/aarch64/strchrnul-mte.S
@@ -20,38 +20,32 @@
#define src x2
#define tmp1 x1
#define tmp2 x3
-#define tmp2w w3
#define vrepchr v0
#define vdata v1
#define qdata q1
#define vhas_nul v2
#define vhas_chr v3
-#define vrepmask v4
-#define vend v5
-#define dend d5
+#define vend v4
+#define dend d4
-/* Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+/*
+ Core algorithm:
+ For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+ per byte. We take 4 bits of every comparison byte with shift right and narrow
+ by 4 instruction. Since the bits in the nibble mask reflect the order in
+ which things occur in the original string, counting leading zeros identifies
+ exactly which byte matched. */
ENTRY (__strchrnul_aarch64_mte)
PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
ld1 {vdata.16b}, [src]
- mov tmp2w, 0xf00f
- dup vrepmask.8h, tmp2w
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
lsl tmp2, srcin, 2
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov tmp1, dend
lsr tmp1, tmp1, tmp2 /* Mask padding bits. */
cbz tmp1, L(loop)
@@ -70,8 +64,7 @@ L(loop):
fmov tmp1, dend
cbz tmp1, L(loop)
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov tmp1, dend
#ifndef __AARCH64EB__
rbit tmp1, tmp1
diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S
index 92b2850..ba4a7d8 100644
--- a/string/aarch64/strcpy.S
+++ b/string/aarch64/strcpy.S
@@ -22,7 +22,6 @@
#define len x4
#define synd x4
#define tmp x5
-#define wtmp w5
#define shift x5
#define data1 x6
#define dataw1 w6
@@ -32,9 +31,8 @@
#define dataq q0
#define vdata v0
#define vhas_nul v1
-#define vrepmask v2
-#define vend v3
-#define dend d3
+#define vend v2
+#define dend d2
#define dataq2 q1
#ifdef BUILD_STPCPY
@@ -45,34 +43,29 @@
# define IFSTPCPY(X,...)
#endif
-/* Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+/*
+ Core algorithm:
+ For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+ per byte. We take 4 bits of every comparison byte with shift right and narrow
+ by 4 instruction. Since the bits in the nibble mask reflect the order in
+ which things occur in the original string, counting leading zeros identifies
+ exactly which byte matched. */
ENTRY (STRCPY)
PTR_ARG (0)
PTR_ARG (1)
bic src, srcin, 15
- mov wtmp, 0xf00f
ld1 {vdata.16b}, [src]
- dup vrepmask.8h, wtmp
cmeq vhas_nul.16b, vdata.16b, 0
lsl shift, srcin, 2
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ shrn vend.8b, vhas_nul.8h, 4
fmov synd, dend
lsr synd, synd, shift
cbnz synd, L(tail)
ldr dataq, [src, 16]!
cmeq vhas_nul.16b, vdata.16b, 0
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ shrn vend.8b, vhas_nul.8h, 4
fmov synd, dend
cbz synd, L(start_loop)
@@ -144,8 +137,7 @@ L(loop):
fmov synd, dend
cbz synd, L(loop)
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
#ifndef __AARCH64EB__
rbit synd, synd
diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S
index 45103ff..0d33ebb 100644
--- a/string/aarch64/strlen-mte.S
+++ b/string/aarch64/strlen-mte.S
@@ -19,15 +19,13 @@
#define src x1
#define synd x2
#define tmp x3
-#define wtmp w3
#define shift x4
#define data q0
#define vdata v0
#define vhas_nul v1
-#define vrepmask v2
-#define vend v3
-#define dend d3
+#define vend v2
+#define dend d2
/* Core algorithm:
@@ -41,13 +39,10 @@
ENTRY (__strlen_aarch64_mte)
PTR_ARG (0)
bic src, srcin, 15
- mov wtmp, 0xf00f
ld1 {vdata.16b}, [src]
- dup vrepmask.8h, wtmp
cmeq vhas_nul.16b, vdata.16b, 0
lsl shift, srcin, 2
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
cbz synd, L(loop)
@@ -65,8 +60,7 @@ L(loop):
fmov synd, dend
cbz synd, L(loop)
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
sub result, src, srcin
fmov synd, dend
#ifndef __AARCH64EB__