/*
 * strchr - find a character in a string
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "../asmdefs.h"

#define srcin		x0
#define chrin		w1
#define result		x0

#define src		x2
#define tmp1		x1
#define wtmp2		w3
#define tmp3		x3

#define vrepchr		v0
#define vdata		v1
#define qdata		q1
#define vhas_nul	v2
#define vhas_chr	v3
#define vrepmask	v4
#define vrepmask2	v5
#define vend		v6
#define dend		d6

/* Core algorithm.

   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
   per byte. For even bytes, bits 0-1 are set if the relevant byte matched the
   requested character, bits 2-3 are set if the byte is NUL (or matched), and
 * bits 4-7 are not used and must be zero if none of bits 0-3 are set. Odd
   bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits
   in the syndrome reflect the order in which things occur in the original
   string, counting trailing zeros identifies exactly which byte matched.  */
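
/* A rough scalar model of this scheme (illustrative only, not part of the
   build; strchr_model is a hypothetical name, <stdint.h> is assumed, and
   unlike the real code it ignores alignment and may read past the
   terminator):

     const char *
     strchr_model (const char *s, int c)
     {
       for (;; s += 16)
	 {
	   uint64_t syndrome = 0;
	   for (int i = 0; i < 16; i++)
	     {
	       unsigned bits = 0;
	       if (s[i] == (char) c)
		 bits |= 0x3;			// bits 0-1: character match
	       if (s[i] == 0 || s[i] == (char) c)
		 bits |= 0xc;			// bits 2-3: NUL (or match)
	       syndrome |= (uint64_t) bits << (i * 4);
	     }
	   if (syndrome != 0)
	     {
	       int first = __builtin_ctzll (syndrome);
	       // Bit 1 set: the terminator comes first, so no match.
	       return (first & 2) ? 0 : s + (first >> 2);
	     }
	 }
     }
*/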

ENTRY (__strchr_aarch64_mte)
	PTR_ARG (0)
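	/* Work on aligned 16-byte chunks: no load crosses a 16-byte
	   boundary, so no load can touch an MTE tag granule that holds
	   no byte of the string.  */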
	bic	src, srcin, 15
	dup	vrepchr.16b, chrin
	ld1	{vdata.16b}, [src]
	mov	wtmp2, 0x3003
	dup	vrepmask.8h, wtmp2
	cmeq	vhas_nul.16b, vdata.16b, 0
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	mov	wtmp2, 0xf00f
	dup	vrepmask2.8h, wtmp2
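	/* vrepmask selects the character-match bits (bits 0-1 of even bytes,
	   bits 4-5 of odd bytes); vrepmask2 then keeps only the low nibble of
	   even bytes and the high nibble of odd bytes, so the addp below can
	   merge each byte pair into one byte holding 4 bits per input byte.  */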

	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
	lsl	tmp3, srcin, 2
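	/* tmp3 counts the syndrome bits to discard for the bytes that precede
	   srcin in the aligned chunk: 4 bits per byte of misalignment.  The
	   lsr below uses only the low 6 bits of tmp3, i.e. (srcin & 15) * 4.  */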
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */

	fmov	tmp1, dend
	lsr	tmp1, tmp1, tmp3
	cbz	tmp1, L(loop)

	rbit	tmp1, tmp1
	clz	tmp1, tmp1
	/* Tmp1 is a multiple of 4 (bit 1 clear) if the target character
	   was found first.  Otherwise the string terminator was found
	   first and NULL is returned.  */
	tst	tmp1, 2
	add	result, srcin, tmp1, lsr 2
	csel	result, result, xzr, eq
	ret

	.p2align 4
L(loop):
	ldr	qdata, [src, 16]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
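	/* vhas_chr is 0xff where the byte matched and 0 elsewhere, so an
	   unsigned vhas_chr >= vdata compare sets a byte exactly when it
	   matched the character or is NUL.  */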
	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
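	/* Reduce 16 bytes to 8: tmp1 below is non-zero iff this chunk
	   contains the character or the terminator.  */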
	fmov	tmp1, dend
	cbz	tmp1, L(loop)

#ifdef __AARCH64EB__
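	/* On big-endian the ldr above loads the chunk as one big-endian
	   integer, so the first string byte ends up in the most significant
	   bits of the syndrome and clz needs no rbit; bif rather than bit
	   keeps the character-match bits ahead of the NUL bits in that
	   reversed order.  */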
	bif	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
	fmov	tmp1, dend
#else
	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
	fmov	tmp1, dend
	rbit	tmp1, tmp1
#endif
	clz	tmp1, tmp1
	/* Tmp1 is a multiple of 4 (bit 1 clear) if the target character
	   was found first.  Otherwise the string terminator was found
	   first and NULL is returned.  */
	tst	tmp1, 2
	add	result, src, tmp1, lsr 2
	csel	result, result, xzr, eq
	ret

END (__strchr_aarch64_mte)