author    Wilco Dijkstra <wilco.dijkstra@arm.com>  2020-04-29 15:21:25 +0100
committer Szabolcs Nagy <szabolcs.nagy@arm.com>    2020-04-30 08:57:21 +0100
commit    7aeabda56a8d93c5b1c3cd74b66ca5fe3e75d66a (patch)
tree      152b17e4029c884921b5727d44e15f9b6649325c /string
parent    2f12ab4a5c85b3f7cc061f486c86b2c5195d9fb6 (diff)
string: Improve strchr-mte performance
Improve strchr performance by using a more efficient termination test. On various microarchitectures the speedup is 19% on both large and small strings.
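The termination test in question can be modeled in scalar C. The sketch below is illustrative only (the helper name is hypothetical, not part of the patch): the new loop relies on an unsigned "higher or same" compare of the match mask against the data, which holds exactly for bytes that matched the character (mask 0xff) or are NUL (0 >= 0), so the single cmhs instruction replaces the previous cmeq-with-zero plus orr.

    #include <stdint.h>

    /* Illustrative scalar model of the new termination test; the
       function name is hypothetical.  */
    static int chunk_terminates(const uint8_t *chunk, uint8_t c)
    {
        for (int i = 0; i < 16; i++) {
            /* 0xff for a byte equal to the target character, like the
               cmeq vhas_chr, vdata, vrepchr result.  */
            uint8_t chr_mask = (chunk[i] == c) ? 0xff : 0x00;
            /* Unsigned chr_mask >= chunk[i] holds iff the byte matched
               (0xff >= anything) or is NUL (0 >= 0), mirroring the
               cmhs vhas_nul, vhas_chr, vdata instruction.  */
            if (chr_mask >= chunk[i])
                return 1;
        }
        return 0;
    }

A full scalar model of the syndrome scheme, including the decode, follows the diff below.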
Diffstat (limited to 'string')
-rw-r--r--  string/aarch64/strchr-mte.S | 147
1 file changed, 60 insertions(+), 87 deletions(-)
diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S
index db46c8f..4dfa31c 100644
--- a/string/aarch64/strchr-mte.S
+++ b/string/aarch64/strchr-mte.S
@@ -1,126 +1,99 @@
/*
* strchr - find a character in a string
*
- * Copyright (c) 2014-2020, Arm Limited.
+ * Copyright (c) 2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
- * ARMv8-a, AArch64
- * Neon Available.
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
*/
#include "../asmdefs.h"
-/* Arguments and results. */
#define srcin x0
#define chrin w1
-
#define result x0
#define src x2
-#define tmp1 x3
-#define wtmp2 w4
-#define tmp3 x5
+#define tmp1 x1
+#define wtmp2 w3
+#define tmp3 x3
#define vrepchr v0
-#define qdata q1
#define vdata v1
#define vhas_nul v2
#define vhas_chr v3
-#define vrepmask_0 v4
-#define vrepmask_c v5
+#define vrepmask v4
+#define vrepmask2 v5
#define vend v6
-
-#define L(l) .L ## l
+#define dend d6
/* Core algorithm.
- For each 16-byte chunk we calculate a 64-bit syndrome value, with
- four bits per byte (LSB is always in bits 0 and 1, for both big
- and little-endian systems). For each tuple, bit 0 is set if
- the relevant byte matched the requested character; bit 1 is set
- if the relevant byte matched the NUL end of string (we trigger
- off bit0 for the special case of looking for NUL) and bits 2 and 3
- are not used.
- Since the bits in the syndrome reflect exactly the order in which
- things occur in the original string a count_trailing_zeros()
- operation will identify exactly which byte is causing the termination,
- and why. */
-
-/* Locals and temporaries. */
+ For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+ per byte. For even bytes, bits 0-1 are set if the relevant byte matched the
+ requested character, bits 2-3 are set if the byte is NUL (or matched), and
+ bits 4-7 are not used and must be zero if none of bits 0-3 are set. Odd
+ bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits
+ in the syndrome reflect the order in which things occur in the original
+ string, counting trailing zeros identifies exactly which byte matched. */
ENTRY(__strchr_aarch64_mte)
- /* Magic constant 0x10011001 to allow us to identify which lane
- matches the requested byte. Magic constant 0x20022002 used
- similarly for NUL termination. */
- mov wtmp2, #0x1001
- movk wtmp2, #0x1001, lsl #16
+ bic src, srcin, 15
dup vrepchr.16b, chrin
- bic src, srcin, #15 /* Work with aligned 16-byte chunks. */
- dup vrepmask_c.4s, wtmp2
- ands tmp1, srcin, #15
- add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
- b.eq L(loop)
-
- /* Input string is not 16-byte aligned. Rather than forcing
- the padding bytes to a safe value, we calculate the syndrome
- for all the bytes, but then mask off those bits of the
- syndrome that are related to the padding. */
- ld1 {vdata.16b}, [src], #16
- cmeq vhas_nul.16b, vdata.16b, #0
+ ld1 {vdata.16b}, [src]
+ mov wtmp2, 0x3003
+ dup vrepmask.8h, wtmp2
+ cmeq vhas_nul.16b, vdata.16b, 0
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- and vhas_nul.16b, vhas_nul.16b, vrepmask_0.16b
- and vhas_chr.16b, vhas_chr.16b, vrepmask_c.16b
- lsl tmp1, tmp1, #2
- orr vend.16b, vhas_nul.16b, vhas_chr.16b
- mov tmp3, #~0
- addp vend.16b, vend.16b, vend.16b /* 128->64 */
- lsl tmp1, tmp3, tmp1
-
- mov tmp3, vend.d[0]
- ands tmp1, tmp3, tmp1 /* Mask padding bits. */
- b.ne L(tail)
+ mov wtmp2, 0xf00f
+ dup vrepmask2.8h, wtmp2
+
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ lsl tmp3, srcin, 2
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+
+ fmov tmp1, dend
+ lsr tmp1, tmp1, tmp3
+ cbz tmp1, L(loop1)
+
+ rbit tmp1, tmp1
+ clz tmp1, tmp1
+ /* Tmp1 is a multiple of 4 if the target character was found
+ first. Otherwise we've found the end of string. */
+ tst tmp1, 2
+ add result, srcin, tmp1, lsr 2
+ csel result, result, xzr, eq
+ ret
+L(loop1):
+ add src, src, 16
+
+ .p2align 4
L(loop):
- ld1 {vdata.16b}, [src], #16
- cmeq vhas_nul.16b, vdata.16b, #0
- cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- /* Use a fast check for the termination condition. */
- orr vend.16b, vhas_nul.16b, vhas_chr.16b
- addp vend.16b, vend.16b, vend.16b /* 128->64 */
- mov tmp1, vend.d[0]
- cbnz tmp1, L(end)
-
- ld1 {vdata.16b}, [src], #16
- cmeq vhas_nul.16b, vdata.16b, #0
+ ld1 {vdata.16b}, [src], 16
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- /* Use a fast check for the termination condition. */
- orr vend.16b, vhas_nul.16b, vhas_chr.16b
- addp vend.16b, vend.16b, vend.16b /* 128->64 */
- mov tmp1, vend.d[0]
+ cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov tmp1, dend
cbz tmp1, L(loop)
-L(end):
- /* Termination condition found. Now need to establish exactly why
- we terminated. */
- and vhas_nul.16b, vhas_nul.16b, vrepmask_0.16b
- and vhas_chr.16b, vhas_chr.16b, vrepmask_c.16b
- orr vend.16b, vhas_nul.16b, vhas_chr.16b
- addp vend.16b, vend.16b, vend.16b /* 128->64 */
-
- mov tmp1, vend.d[0]
-L(tail):
- /* Count the trailing zeros, by bit reversing... */
+ sub src, src, 16
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ bic vhas_nul.8h, 0x0f, lsl 8
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+
+ fmov tmp1, dend
rbit tmp1, tmp1
- /* Re-bias source. */
- sub src, src, #16
- clz tmp1, tmp1 /* And counting the leading zeros. */
- /* Tmp1 is even if the target character was found first. Otherwise
- we've found the end of string and we weren't looking for NUL. */
- tst tmp1, #1
- add result, src, tmp1, lsr #2
+ clz tmp1, tmp1
+ /* Tmp1 is a multiple of 4 if the target character was found
+ first. Otherwise we've found the end of string. */
+ tst tmp1, 2
+ add result, src, tmp1, lsr 2
csel result, result, xzr, eq
ret
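For reference, the syndrome construction and decode described in the core-algorithm comment can be modeled in scalar C for an aligned chunk (the misalignment masking of the first iteration is not modeled). This is a hedged idealization, not the committed code: here a matching byte sets all four bits of its nibble and a NUL-only byte sets bits 2-3, matching the even-byte masks 0x03 and 0x0c taken from the vrepmask constants 0x3003 and 0xf00f, and __builtin_ctzll (a GCC/Clang builtin) stands in for the rbit + clz pair.

    #include <stddef.h>
    #include <stdint.h>

    /* Idealized scalar model of the syndrome scheme; names are
       hypothetical.  Each byte of a 16-byte chunk contributes one
       nibble to a 64-bit syndrome: bits 0-1 for a character match,
       bits 2-3 for NUL (or a match).  */
    static const char *decode_chunk(const uint8_t *chunk, uint8_t c)
    {
        uint64_t syndrome = 0;
        for (int i = 0; i < 16; i++) {
            uint64_t nibble = 0;
            if (chunk[i] == c)
                nibble |= 0x3;        /* bits 0-1: character match  */
            if (chunk[i] == 0 || chunk[i] == c)
                nibble |= 0xc;        /* bits 2-3: NUL (or match)   */
            syndrome |= nibble << (4 * i);
        }
        if (syndrome == 0)
            return NULL;              /* no match, no NUL: the
                                         assembly would keep looping */

        /* rbit + clz in the assembly.  */
        int tz = __builtin_ctzll(syndrome);
        if (tz & 2)
            return NULL;              /* first event is a NUL: fail  */
        return (const char *)chunk + (tz >> 2);
    }

The decode mirrors the tail of the assembly: tz >> 2 recovers the byte index (the add result, src, tmp1, lsr 2), and bit 1 of tz distinguishes a NUL from a match (the tst tmp1, 2 / csel pair), since a match puts its lowest set bit at 4*i while a NUL-only byte puts it at 4*i + 2.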