author     Wilco Dijkstra <wilco.dijkstra@arm.com>  2020-05-18 16:19:51 +0100
committer  Szabolcs Nagy <szabolcs.nagy@arm.com>    2020-05-18 16:19:51 +0100
commit     a99a1a9615b953b59e98fa22d780087a34a7e22b (patch)
tree       40c7fc508c94f16ac7fefb9aa975e8c8e43d2bfc /string
parent     98e4d6a5c13c8e542c56f26dc48171205d3923b3 (diff)
string: Improve strrchr-mte performance
Improve strrchr performance by using a fast strchr loop to find the first match. Across various microarchitectures the speedup is 30-80% on large strings and 32% on small strings.
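The idea behind the patch is that a strchr-style scan is cheap and handles the common case where the character is rare or absent; only after the first match (or the end of the string) is found does the routine start tracking the position of the last match. The plain C sketch below only illustrates that strategy (the helper name strrchr_sketch and the byte-wise scanning are illustrative, not part of this patch); the actual routine processes 16 bytes per iteration with Advanced SIMD.

#include <string.h>
#include <stddef.h>

/* Illustrative sketch only: find the last occurrence of c in s by first
   running a cheap strchr-style search, then tracking later matches.  */
static char *strrchr_sketch (const char *s, int c)
{
	/* Fast strchr-style scan: locate the first match, or fail early
	   if the character never occurs at all.  */
	char *last = strchr (s, c);
	if (last == NULL || (char) c == '\0')
		return last;	/* No match, or the match is the terminator.  */

	/* From the first match onwards, remember the most recent match.  */
	for (const char *p = last + 1; (p = strchr (p, c)) != NULL; p++)
		last = (char *) p;
	return last;
}

Keeping the book-keeping out of the first loop is what the commit message means by using a fast strchr loop: the common case never pays for tracking the last match.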
Diffstat (limited to 'string')
-rw-r--r--  string/aarch64/strrchr-mte.S  179
1 file changed, 86 insertions(+), 93 deletions(-)
diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S
index dc0abdc..5a409b9 100644
--- a/string/aarch64/strrchr-mte.S
+++ b/string/aarch64/strrchr-mte.S
@@ -1,132 +1,125 @@
/*
* strrchr - find last position of a character in a string.
*
- * Copyright (c) 2014-2020, Arm Limited.
+ * Copyright (c) 2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
- * ARMv8-a, AArch64
- * Neon Available.
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
*/
#include "../asmdefs.h"
-/* Arguments and results. */
#define srcin x0
#define chrin w1
-
#define result x0
#define src x2
-#define tmp1 x3
-#define wtmp2 w4
-#define tmp3 x5
-#define src_match x6
-#define src_offset x7
-#define const_m1 x8
-#define tmp4 x9
-#define nul_match x10
-#define chr_match x11
+#define tmp x3
+#define wtmp w3
+#define synd x3
+#define shift x4
+#define src_match x4
+#define nul_match x5
+#define chr_match x6
#define vrepchr v0
#define vdata v1
#define vhas_nul v2
#define vhas_chr v3
-#define vrepmask_0 v4
-#define vrepmask_c v16
-#define vend v17
+#define vrepmask v4
+#define vrepmask2 v5
+#define vend v5
+#define dend d5
/* Core algorithm.
For each 16-byte chunk we calculate a 64-bit syndrome value, with
four bits per byte (LSB is always in bits 0 and 1, for both big
- and little-endian systems). For each tuple, bit 0 is set if
- the relevant byte matched the requested character; bit 1 is set
- if the relevant byte matched the NUL end of string (we trigger
- off bit0 for the special case of looking for NUL) and bits 2 and 3
- are not used.
- Since the bits in the syndrome reflect exactly the order in which
- things occur in the original string a count_trailing_zeros()
- operation will identify exactly which byte is causing the termination,
- and why. */
+ and little-endian systems). For each tuple, bits 0-1 are set if
+ the relevant byte matched the requested character; bits 2-3 are set
+ if the relevant byte matched the NUL end of string. */
ENTRY (__strrchr_aarch64_mte)
- /* Magic constant 0x10011001 to allow us to identify which lane
- matches the requested byte. Magic constant 0x20022002 used
- similarly for NUL termination. */
- mov wtmp2, #0x1001
- movk wtmp2, #0x1001, lsl #16
+ bic src, srcin, 15
dup vrepchr.16b, chrin
- bic src, srcin, #15 /* Work with aligned 16-byte chunks. */
- dup vrepmask_c.4s, wtmp2
- mov src_offset, #0
- ands tmp1, srcin, #15
- add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
- b.eq L(aligned)
-
- /* Input string is not 16-byte aligned. Rather than forcing
- the padding bytes to a safe value, we calculate the syndrome
- for all the bytes, but then mask off those bits of the
- syndrome that are related to the padding. */
- ld1 {vdata.16b}, [src], #16
- neg tmp1, tmp1
- cmeq vhas_nul.16b, vdata.16b, #0
+ mov wtmp, 0x3003
+ dup vrepmask.8h, wtmp
+ tst srcin, 15
+ beq L(loop1)
+
+ ld1 {vdata.16b}, [src], 16
+ cmeq vhas_nul.16b, vdata.16b, 0
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- and vhas_nul.16b, vhas_nul.16b, vrepmask_0.16b
- and vhas_chr.16b, vhas_chr.16b, vrepmask_c.16b
- addp vhas_nul.16b, vhas_nul.16b, vhas_nul.16b // 128->64
- addp vhas_chr.16b, vhas_chr.16b, vhas_chr.16b // 128->64
- mov nul_match, vhas_nul.d[0]
- lsl tmp1, tmp1, #2
- mov const_m1, #~0
- mov chr_match, vhas_chr.d[0]
- lsr tmp3, const_m1, tmp1
-
- bic nul_match, nul_match, tmp3 // Mask padding bits.
- bic chr_match, chr_match, tmp3 // Mask padding bits.
- cbnz nul_match, L(tail)
-
-L(loop):
- cmp chr_match, #0
- csel src_match, src, src_match, ne
- csel src_offset, chr_match, src_offset, ne
-L(aligned):
- ld1 {vdata.16b}, [src], #16
- cmeq vhas_nul.16b, vdata.16b, #0
+ mov wtmp, 0xf00f
+ dup vrepmask2.8h, wtmp
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ lsl shift, srcin, 2
+ fmov synd, dend
+ lsr synd, synd, shift
+ lsl synd, synd, shift
+ ands nul_match, synd, 0xcccccccccccccccc
+ bne L(tail)
+ cbnz synd, L(loop2)
+
+ .p2align 5
+L(loop1):
+ ld1 {vdata.16b}, [src], 16
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b // 128->64
- and vhas_chr.16b, vhas_chr.16b, vrepmask_c.16b
- addp vhas_chr.16b, vhas_chr.16b, vhas_chr.16b // 128->64
- mov nul_match, vend.d[0]
- mov chr_match, vhas_chr.d[0]
- cbz nul_match, L(loop)
-
- and vhas_nul.16b, vhas_nul.16b, vrepmask_0.16b
- addp vhas_nul.16b, vhas_nul.16b, vhas_nul.16b
- mov nul_match, vhas_nul.d[0]
+ cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbz synd, L(loop1)
+
+ cmeq vhas_nul.16b, vdata.16b, 0
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ bic vhas_nul.8h, 0x0f, lsl 8
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ ands nul_match, synd, 0xcccccccccccccccc
+ beq L(loop2)
L(tail):
- /* Work out exactly where the string ends. */
- sub tmp4, nul_match, #1
- eor tmp4, tmp4, nul_match
- ands chr_match, chr_match, tmp4
- /* And pick the values corresponding to the last match. */
- csel src_match, src, src_match, ne
- csel src_offset, chr_match, src_offset, ne
-
- /* Count down from the top of the syndrome to find the last match. */
- clz tmp3, src_offset
- /* Src_match points beyond the word containing the match, so we can
- simply subtract half the bit-offset into the syndrome. Because
- we are counting down, we need to go back one more character. */
- add tmp3, tmp3, #2
- sub result, src_match, tmp3, lsr #2
- /* But if the syndrome shows no match was found, then return NULL. */
- cmp src_offset, #0
+ sub nul_match, nul_match, 1
+ and chr_match, synd, 0x3333333333333333
+ ands chr_match, chr_match, nul_match
+ sub result, src, 1
+ clz tmp, chr_match
+ sub result, result, tmp, lsr 2
csel result, result, xzr, ne
+ ret
+ .p2align 4
+L(loop2):
+ cmp synd, 0
+ csel src_match, src, src_match, ne
+ csel chr_match, synd, chr_match, ne
+ ld1 {vdata.16b}, [src], 16
+ cmeq vhas_nul.16b, vdata.16b, 0
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ tst synd, 0xcccccccccccccccc
+ beq L(loop2)
+
+ bic vhas_nul.8h, 0x0f, lsl 8
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ and nul_match, synd, 0xcccccccccccccccc
+ sub nul_match, nul_match, 1
+ and tmp, synd, 0x3333333333333333
+ ands tmp, tmp, nul_match
+ csel chr_match, tmp, chr_match, ne
+ csel src_match, src, src_match, ne
+ sub src_match, src_match, 1
+ clz tmp, chr_match
+ sub result, src_match, tmp, lsr 2
ret
END (__strrchr_aarch64_mte)
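To make the syndrome scheme in the core-algorithm comment concrete, the following hedged C sketch shows how one 16-byte chunk could be reduced to a 64-bit syndrome and how the tail's mask-and-CLZ step picks out the last character match that precedes the NUL terminator. The helper name, the byte-wise loop and the use of __builtin_clzll are illustrative only; the assembly builds the same syndrome with CMEQ/BIT/ADDP and the 0x3003/0xf00f repeat masks instead.

#include <stdint.h>
#include <stddef.h>

/* Illustrative sketch only: build the 64-bit syndrome described in the
   core-algorithm comment for one 16-byte chunk that is known to contain
   the NUL terminator, then pick the last match before the terminator.
   Each byte owns one nibble of the syndrome, LSB first: bits 0-1 flag a
   character match, bits 2-3 flag the NUL.  */
static const char *chunk_last_match (const unsigned char *chunk, unsigned char c)
{
	uint64_t synd = 0;
	for (int i = 0; i < 16; i++)
	{
		if (chunk[i] == c)
			synd |= (uint64_t) 0x3 << (4 * i);	/* character-match bits */
		if (chunk[i] == 0)
			synd |= (uint64_t) 0xc << (4 * i);	/* NUL-match bits */
	}

	/* Tail step: nul_match - 1 clears the lowest NUL bit and sets every
	   bit below it, so the AND keeps only character matches that do not
	   lie beyond the end of the string.  */
	uint64_t nul_match = synd & 0xccccccccccccccccULL;
	uint64_t chr_match = synd & 0x3333333333333333ULL;
	chr_match &= nul_match - 1;
	if (chr_match == 0)
		return NULL;

	/* The assembly uses CLZ on the syndrome; dividing the bit position
	   by 4 recovers the byte index within the chunk.  */
	int bit = 63 - __builtin_clzll (chr_match);
	return (const char *) chunk + bit / 4;
}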