diff options
author | Wilco Dijkstra <wilco.dijkstra@arm.com> | 2020-04-29 15:21:25 +0100 |
---|---|---|
committer | Szabolcs Nagy <szabolcs.nagy@arm.com> | 2020-04-30 08:57:21 +0100 |
commit | 7aeabda56a8d93c5b1c3cd74b66ca5fe3e75d66a (patch) | |
tree | 152b17e4029c884921b5727d44e15f9b6649325c /string | |
parent | 2f12ab4a5c85b3f7cc061f486c86b2c5195d9fb6 (diff) | |
download | arm-optimized-routines-7aeabda56a8d93c5b1c3cd74b66ca5fe3e75d66a.tar.gz |
string: Improve strchr-mte performance
Improve strchr performance by using a more efficient termination test.
On various microarchitectures the speedup is 19% on both large and
small strings.
Diffstat (limited to 'string')
-rw-r--r-- | string/aarch64/strchr-mte.S | 147 |
1 file changed, 60 insertions, 87 deletions
diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S index db46c8f..4dfa31c 100644 --- a/string/aarch64/strchr-mte.S +++ b/string/aarch64/strchr-mte.S @@ -1,126 +1,99 @@ /* * strchr - find a character in a string * - * Copyright (c) 2014-2020, Arm Limited. + * Copyright (c) 2020, Arm Limited. * SPDX-License-Identifier: MIT */ /* Assumptions: * - * ARMv8-a, AArch64 - * Neon Available. + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. */ #include "../asmdefs.h" -/* Arguments and results. */ #define srcin x0 #define chrin w1 - #define result x0 #define src x2 -#define tmp1 x3 -#define wtmp2 w4 -#define tmp3 x5 +#define tmp1 x1 +#define wtmp2 w3 +#define tmp3 x3 #define vrepchr v0 -#define qdata q1 #define vdata v1 #define vhas_nul v2 #define vhas_chr v3 -#define vrepmask_0 v4 -#define vrepmask_c v5 +#define vrepmask v4 +#define vrepmask2 v5 #define vend v6 - -#define L(l) .L ## l +#define dend d6 /* Core algorithm. - For each 16-byte chunk we calculate a 64-bit syndrome value, with - four bits per byte (LSB is always in bits 0 and 1, for both big - and little-endian systems). For each tuple, bit 0 is set if - the relevant byte matched the requested character; bit 1 is set - if the relevant byte matched the NUL end of string (we trigger - off bit0 for the special case of looking for NUL) and bits 2 and 3 - are not used. - Since the bits in the syndrome reflect exactly the order in which - things occur in the original string a count_trailing_zeros() - operation will identify exactly which byte is causing the termination, - and why. */ - -/* Locals and temporaries. */ + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-1 are set if the relevant byte matched the + requested character, bits 2-3 are set if the byte is NUL (or matched), and + bits 4-7 are not used and must be zero if none of bits 0-3 are set). Odd + bytes set bits 4-7 so that adjacent bytes can be merged. 
Since the bits + in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ ENTRY(__strchr_aarch64_mte) - /* Magic constant 0x10011001 to allow us to identify which lane - matches the requested byte. Magic constant 0x20022002 used - similarly for NUL termination. */ - mov wtmp2, #0x1001 - movk wtmp2, #0x1001, lsl #16 + bic src, srcin, 15 dup vrepchr.16b, chrin - bic src, srcin, #15 /* Work with aligned 16-byte chunks. */ - dup vrepmask_c.4s, wtmp2 - ands tmp1, srcin, #15 - add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ - b.eq L(loop) - - /* Input string is not 16-byte aligned. Rather than forcing - the padding bytes to a safe value, we calculate the syndrome - for all the bytes, but then mask off those bits of the - syndrome that are related to the padding. */ - ld1 {vdata.16b}, [src], #16 - cmeq vhas_nul.16b, vdata.16b, #0 + ld1 {vdata.16b}, [src] + mov wtmp2, 0x3003 + dup vrepmask.8h, wtmp2 + cmeq vhas_nul.16b, vdata.16b, 0 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - and vhas_nul.16b, vhas_nul.16b, vrepmask_0.16b - and vhas_chr.16b, vhas_chr.16b, vrepmask_c.16b - lsl tmp1, tmp1, #2 - orr vend.16b, vhas_nul.16b, vhas_chr.16b - mov tmp3, #~0 - addp vend.16b, vend.16b, vend.16b /* 128->64 */ - lsl tmp1, tmp3, tmp1 - - mov tmp3, vend.d[0] - ands tmp1, tmp3, tmp1 /* Mask padding bits. */ - b.ne L(tail) + mov wtmp2, 0xf00f + dup vrepmask2.8h, wtmp2 + + bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b + and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b + lsl tmp3, srcin, 2 + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + + fmov tmp1, dend + lsr tmp1, tmp1, tmp3 + cbz tmp1, L(loop1) + + rbit tmp1, tmp1 + clz tmp1, tmp1 + /* Tmp1 is an even multiple of 2 if the target character was + found first. Otherwise we've found the end of string. 
*/ + tst tmp1, 2 + add result, srcin, tmp1, lsr 2 + csel result, result, xzr, eq + ret +L(loop1): + add src, src, 16 + + .p2align 4 L(loop): - ld1 {vdata.16b}, [src], #16 - cmeq vhas_nul.16b, vdata.16b, #0 - cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - /* Use a fast check for the termination condition. */ - orr vend.16b, vhas_nul.16b, vhas_chr.16b - addp vend.16b, vend.16b, vend.16b /* 128->64 */ - mov tmp1, vend.d[0] - cbnz tmp1, L(end) - - ld1 {vdata.16b}, [src], #16 - cmeq vhas_nul.16b, vdata.16b, #0 + ld1 {vdata.16b}, [src], 16 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b - /* Use a fast check for the termination condition. */ - orr vend.16b, vhas_nul.16b, vhas_chr.16b - addp vend.16b, vend.16b, vend.16b /* 128->64 */ - mov tmp1, vend.d[0] + cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b + umaxp vend.16b, vhas_nul.16b, vhas_nul.16b + fmov tmp1, dend cbz tmp1, L(loop) -L(end): - /* Termination condition found. Now need to establish exactly why - we terminated. */ - and vhas_nul.16b, vhas_nul.16b, vrepmask_0.16b - and vhas_chr.16b, vhas_chr.16b, vrepmask_c.16b - orr vend.16b, vhas_nul.16b, vhas_chr.16b - addp vend.16b, vend.16b, vend.16b /* 128->64 */ - - mov tmp1, vend.d[0] -L(tail): - /* Count the trailing zeros, by bit reversing... */ + sub src, src, 16 + bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b + bic vhas_nul.8h, 0x0f, lsl 8 + addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */ + + fmov tmp1, dend rbit tmp1, tmp1 - /* Re-bias source. */ - sub src, src, #16 - clz tmp1, tmp1 /* And counting the leading zeros. */ - /* Tmp1 is even if the target character was found first. Otherwise - we've found the end of string and we weren't looking for NUL. */ - tst tmp1, #1 - add result, src, tmp1, lsr #2 + clz tmp1, tmp1 + /* Tmp1 is an even multiple of 2 if the target character was + found first. Otherwise we've found the end of string. */ + tst tmp1, 2 + add result, src, tmp1, lsr 2 csel result, result, xzr, eq ret |