path: root/string/aarch64/strchrnul-mte.S
/*
 * strchrnul - find a character or nul in a string
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "../asmdefs.h"

#define srcin		x0
#define chrin		w1
#define result		x0

#define src		x2
#define tmp1		x1
#define tmp2		x3
#define tmp2w		w3

#define vrepchr		v0
#define vdata		v1
#define qdata		q1
#define vhas_nul	v2
#define vhas_chr	v3
#define vrepmask	v4
#define vend		v5
#define dend		d5

/* Core algorithm:

   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
   requested character or the byte is NUL, and bits 4-7 must be zero. For odd
   bytes the roles are reversed, so each pair of adjacent bytes can be merged
   into a single syndrome byte. Since the bits in the syndrome reflect the
   order in which things occur in the original string, counting trailing
   zeros identifies exactly which byte matched.  */
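
/* The syndrome described above is equivalent to the following C sketch
   (an illustration only; this comment is not assembled).  ctz (syn) / 4
   recovers the index of the first matching byte within the chunk:

     #include <stdint.h>

     uint64_t
     syndrome (const uint8_t *chunk, uint8_t c)
     {
       uint64_t syn = 0;
       for (int i = 0; i < 16; i++)
         if (chunk[i] == c || chunk[i] == 0)
           syn |= 0xfull << (4 * i);
       return syn;
     }
 */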

ENTRY (__strchrnul_aarch64_mte)
	PTR_ARG (0)
	bic	src, srcin, 15
	dup	vrepchr.16b, chrin
	ld1	{vdata.16b}, [src]
	mov	tmp2w, 0xf00f
	dup	vrepmask.8h, tmp2w
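	/* vrepmask selects the nibble each byte may set: 0xf00f per 16-bit
	   lane is 0x0f in even bytes and 0xf0 in odd bytes (little-endian
	   lane order).  */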
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
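	/* cmeq leaves 0xff where vdata matched chrin and 0x00 elsewhere;
	   the unsigned >= above then also flags NUL bytes, since 0xff >=
	   any byte while 0x00 >= vdata only where vdata is 0.  */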
	lsl	tmp2, srcin, 2
	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	tmp1, dend
	lsr	tmp1, tmp1, tmp2	/* Mask padding bits.  */
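	/* Worked example (hypothetical values): srcin = 0x1003 gives
	   src = 0x1000 and tmp2 = 0x400c, of which the lsr uses only the
	   low 6 bits (12), so the shift drops the three syndrome nibbles
	   belonging to bytes before srcin.  Trailing zeros of tmp1 are
	   then counted relative to srcin.  */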
	cbz	tmp1, L(loop)

	rbit	tmp1, tmp1
	clz	tmp1, tmp1
	add	result, srcin, tmp1, lsr 2
	ret

	.p2align 4
L(loop):
	ldr	qdata, [src, 16]!
	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b
	fmov	tmp1, dend
	cbz	tmp1, L(loop)
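
	/* umaxp in the loop only answers "any match in this chunk?".  Once
	   one is found, the precise syndrome is rebuilt here: the 0xf00f
	   mask and addp merge each pair of adjacent bytes into one syndrome
	   byte (low nibble from the even byte, high from the odd).  */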

	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
	fmov	tmp1, dend
#ifndef __AARCH64EB__
	rbit	tmp1, tmp1
#endif
	clz	tmp1, tmp1
	add	result, src, tmp1, lsr 2
	ret

END (__strchrnul_aarch64_mte)
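
/* A minimal, hypothetical test harness (not part of this file; it assumes
   the routine follows the standard strchrnul prototype):

     #include <assert.h>

     char *__strchrnul_aarch64_mte (const char *s, int c);

     int
     main (void)
     {
       const char *s = "hello";
       assert (__strchrnul_aarch64_mte (s, 'l') == s + 2);
       assert (__strchrnul_aarch64_mte (s, 'z') == s + 5);   -- points at the NUL
       return 0;
     }
 */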