author     Wilco Dijkstra <wilco.dijkstra@arm.com>    2020-05-01 09:21:24 +0100
committer  Szabolcs Nagy <szabolcs.nagy@arm.com>      2020-05-01 09:22:51 +0100
commit     7bb8464fd27e08fb9540589581644b53d876fe00 (patch)
tree       659619ca189ec30de13cb234a9ecb9c265679bee /string/aarch64
parent     1de12a678fc768145c33a107c7bacb8b60f89490 (diff)
download   arm-optimized-routines-7bb8464fd27e08fb9540589581644b53d876fe00.tar.gz
string: Further improve strchr-mte performance
Remove 2 more instructions, resulting in a 6.8% speedup on medium-sized
strings (16-32 bytes).

The BTI patch changed ENTRY, which left the loops misaligned; this fixes
that regression.
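
The speedup comes from the search loop's syndrome scheme: each 16-byte block is reduced to a small word in which a character match and the terminating NUL set distinct bits, so once that word is nonzero, counting trailing zeros locates the first interesting byte and its parity says whether that byte is a match or the end of the string. Below is a minimal portable C sketch of the idea, assuming GCC/Clang for __builtin_ctz; the function name, the 2-bit-per-byte layout, and the omission of the unaligned-first-block handling are illustrative choices, not the routine's actual NEON sequence.

    #include <stdint.h>

    /* Model of the syndrome search: 2 syndrome bits per input byte,
       character match on the even bit, NUL on the odd bit.  */
    char *model_strchr(const char *s, int c)
    {
        const unsigned char ch = (unsigned char) c;

        for (;; s += 16) {
            uint32_t syndrome = 0;
            for (int i = 0; i < 16; i++) {
                unsigned char b = (unsigned char) s[i];
                if (b == ch)
                    syndrome |= 1u << (2 * i);  /* match: even bit */
                if (b == 0)
                    syndrome |= 2u << (2 * i);  /* NUL: odd bit */
            }
            if (syndrome == 0)
                continue;                       /* next 16-byte block */

            int tz = __builtin_ctz(syndrome);
            /* Even count: the character was found first (or c == 0 and
               this is the terminator).  Odd count: end of string.  */
            return (tz & 1) ? 0 : (char *) (s + tz / 2);
        }
    }

This mirrors the comment preserved in the diff below: because syndrome bits appear in string order, counting trailing zeros (after rbit/clz in the big-endian-aware assembly) identifies exactly which byte matched.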
Diffstat (limited to 'string/aarch64')
-rw-r--r--  string/aarch64/strchr-mte.S  23
1 file changed, 13 insertions, 10 deletions
diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S
index 66c6c22..b9a5e71 100644
--- a/string/aarch64/strchr-mte.S
+++ b/string/aarch64/strchr-mte.S
@@ -24,6 +24,7 @@
 #define vrepchr		v0
 #define vdata		v1
+#define qdata		q1
 #define vhas_nul	v2
 #define vhas_chr	v3
 #define vrepmask	v4
@@ -41,7 +42,7 @@
    in the syndrome reflect the order in which things occur in the original
    string, counting trailing zeros identifies exactly which byte matched.  */
 
-ENTRY(__strchr_aarch64_mte)
+ENTRY (__strchr_aarch64_mte)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
 	ld1	{vdata.16b}, [src]
@@ -59,7 +60,7 @@ ENTRY(__strchr_aarch64_mte)
 	fmov	tmp1, dend
 	lsr	tmp1, tmp1, tmp3
-	cbz	tmp1, L(loop1)
+	cbz	tmp1, L(loop)
 
 	rbit	tmp1, tmp1
 	clz	tmp1, tmp1
@@ -70,25 +71,27 @@ ENTRY(__strchr_aarch64_mte)
 	csel	result, result, xzr, eq
 	ret
 
-L(loop1):
-	add	src, src, 16
-
 	.p2align 4
 L(loop):
-	ld1	{vdata.16b}, [src], 16
+	ldr	qdata, [src, 16]!
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
 	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
 	fmov	tmp1, dend
 	cbz	tmp1, L(loop)
 
+#ifdef __AARCH64EB__
+	bif	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	fmov	tmp1, dend
+#else
 	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	bic	vhas_nul.8h, 0x0f, lsl 8
+	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
 	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
-	fmov	tmp1, dend
 	rbit	tmp1, tmp1
+#endif
 	clz	tmp1, tmp1
 	/* Tmp1 is an even multiple of 2 if the target character was found
 	   first.  Otherwise we've found the end of string.  */
@@ -97,6 +100,6 @@ L(loop):
 	csel	result, result, xzr, eq
 	ret
 
-END(__strchr_aarch64_mte)
+END (__strchr_aarch64_mte)
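
The two removed instructions are the pointer fix-ups around the loop: the old code primed src before entering (add src, src, 16), advanced it with a post-indexed ld1 on every iteration, and then had to undo the overshoot on exit (sub src, src, 16), whereas ldr qdata, [src, 16]! is a pre-indexed load that bumps src first and then loads, leaving src pointing at the block just examined. A rough C analogue of the restructuring, with block_syndrome as a hypothetical stand-in for the NEON cmeq/cmhs/umaxp steps (not a name from the source):

    #include <stdint.h>

    /* Hypothetical stand-in: nonzero iff the 16-byte block at p
       contains the target character or a NUL.  */
    static uint32_t block_syndrome(const char *p, unsigned char ch)
    {
        uint32_t syn = 0;
        for (int i = 0; i < 16; i++)
            if ((unsigned char) p[i] == ch || p[i] == 0)
                syn |= 1u << i;
        return syn;
    }

    /* Old shape: prime src, post-indexed load, undo the overshoot.  */
    static const char *scan_old(const char *src, unsigned char ch)
    {
        uint32_t syn;
        src += 16;                          /* L(loop1): add src, src, 16 */
        do {
            syn = block_syndrome(src, ch);  /* ld1 {vdata.16b}, [src], 16 */
            src += 16;                      /*   ...post-index bump       */
        } while (syn == 0);
        return src - 16;                    /* sub src, src, 16           */
    }

    /* New shape: one pre-indexed load; no fix-ups remain.  */
    static const char *scan_new(const char *src, unsigned char ch)
    {
        uint32_t syn;
        do {
            src += 16;                      /* ldr qdata, [src, 16]!      */
            syn = block_syndrome(src, ch);  /*   bump first, then load    */
        } while (syn == 0);
        return src;                         /* already points at the hit  */
    }

With the BTI landing pad having grown ENTRY by one instruction, the retained .p2align 4 is what places L(loop) back on a 16-byte boundary and repairs the alignment regression the commit message mentions.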