/*
 * strrchr - find last position of a character in a string.
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "../asmdefs.h"

#define srcin		x0
#define chrin		w1
#define result		x0

#define src		x2
#define tmp		x3
#define wtmp		w3
#define synd		x3
#define shift		x4
#define src_match	x4
#define nul_match	x5
#define chr_match	x6

#define vrepchr		v0
#define vdata		v1
#define vhas_nul	v2
#define vhas_chr	v3
#define vrepmask	v4
#define vrepmask2	v5
#define vend		v5
#define dend		d5

/* Core algorithm.

   For each 16-byte chunk we calculate a 64-bit syndrome value, with
   four bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  For each tuple, bits 0-1 are set if
   the relevant byte matched the requested character; bits 2-3 are
   set if the relevant byte matched the NUL end of string.  */

ENTRY (__strrchr_aarch64_mte)
	/* Align the source pointer down to 16 bytes and set up the
	   repeated character and the merge mask used to build the
	   4-bit-per-byte syndrome.  */
	bic	src, srcin, 15
	dup	vrepchr.16b, chrin
	mov	wtmp, 0x3003
	dup	vrepmask.8h, wtmp
	tst	srcin, 15
	beq	L(loop1)

	/* Input is not 16-byte aligned: process the first aligned chunk
	   and clear syndrome bits for bytes before the start of the
	   string.  */
	ld1	{vdata.16b}, [src], 16
	cmeq	vhas_nul.16b, vdata.16b, 0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	mov	wtmp, 0xf00f
	dup	vrepmask2.8h, wtmp
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
	lsl	shift, srcin, 2
	fmov	synd, dend
	lsr	synd, synd, shift
	lsl	synd, synd, shift
	ands	nul_match, synd, 0xcccccccccccccccc
	bne	L(tail)
	cbnz	synd, L(loop2)

	/* Main loop: scan 16 bytes at a time until either the character
	   or the NUL terminator is seen.  */
	.p2align 5
L(loop1):
	ld1	{vdata.16b}, [src], 16
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(loop1)

	/* Loop exit: rebuild the full syndrome for this chunk and check
	   whether it contains a NUL.  */
	cmeq	vhas_nul.16b, vdata.16b, 0
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	bic	vhas_nul.8h, 0x0f, lsl 8
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	ands	nul_match, synd, 0xcccccccccccccccc
	beq	L(loop2)

L(tail):
	/* A NUL was found.  Mask off matches after the NUL and locate the
	   last remaining character match, or return NULL if there is
	   none.  */
	sub	nul_match, nul_match, 1
	and	chr_match, synd, 0x3333333333333333
	ands	chr_match, chr_match, nul_match
	sub	result, src, 1
	clz	tmp, chr_match
	sub	result, result, tmp, lsr 2
	csel	result, result, xzr, ne
	ret

	/* The character was seen but no NUL yet: remember the most recent
	   chunk containing a match and keep scanning until the NUL is
	   reached.  */
	.p2align 4
L(loop2):
	cmp	synd, 0
	csel	src_match, src, src_match, ne
	csel	chr_match, synd, chr_match, ne
	ld1	{vdata.16b}, [src], 16
	cmeq	vhas_nul.16b, vdata.16b, 0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	tst	synd, 0xcccccccccccccccc
	beq	L(loop2)

	/* NUL found: prefer a character match before the NUL in this
	   final chunk, otherwise fall back to the last recorded matching
	   chunk.  */
	bic	vhas_nul.8h, 0x0f, lsl 8
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	and	nul_match, synd, 0xcccccccccccccccc
	sub	nul_match, nul_match, 1
	and	tmp, synd, 0x3333333333333333
	ands	tmp, tmp, nul_match
	csel	chr_match, tmp, chr_match, ne
	csel	src_match, src, src_match, ne
	sub	src_match, src_match, 1
	clz	tmp, chr_match
	sub	result, src_match, tmp, lsr 2
	ret

END (__strrchr_aarch64_mte)
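
/* For reference only (kept inside a comment, not assembled): a minimal C
   sketch of how the tail consumes the 4-bit-per-byte syndrome described
   above.  The function name consume_syndrome and the chunk_end parameter
   are illustrative and not part of this file; chunk_end is assumed to
   point just past the 16-byte chunk that produced synd, as src does at
   L(tail).

   #include <stddef.h>
   #include <stdint.h>

   static const char *
   consume_syndrome (const char *chunk_end, uint64_t synd)
   {
     uint64_t nul_match = synd & 0xccccccccccccccccULL;  // bits 2-3: NUL
     uint64_t chr_match = synd & 0x3333333333333333ULL;  // bits 0-1: char
     // Keep only character matches that are not past the first NUL.
     if (nul_match != 0)
       chr_match &= nul_match - 1;
     if (chr_match == 0)
       return NULL;
     // Each input byte owns four syndrome bits, so the byte offset of the
     // last match from the end of the chunk is clz (chr_match) / 4.
     return chunk_end - 1 - __builtin_clzll (chr_match) / 4;
   }
*/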