Diffstat (limited to 'string/aarch64')
-rw-r--r--  string/aarch64/__mtag_tag_region.S       |   6
-rw-r--r--  string/aarch64/__mtag_tag_zero_region.S  |   6
-rw-r--r--  string/aarch64/asmdefs.h                 |  92
-rw-r--r--  string/aarch64/check-arch.S              |   6
-rw-r--r--  string/aarch64/memchr-mte.S              |  58
-rw-r--r--  string/aarch64/memchr-sve.S              |   6
-rw-r--r--  string/aarch64/memchr.S                  |   6
-rw-r--r--  string/aarch64/memcmp-sve.S              |   6
-rw-r--r--  string/aarch64/memcmp.S                  | 239
-rw-r--r--  string/aarch64/memcpy-advsimd.S          |   6
-rw-r--r--  string/aarch64/memcpy-sve.S              | 177
-rw-r--r--  string/aarch64/memcpy.S                  |   6
-rw-r--r--  string/aarch64/memrchr.S                 |  51
-rw-r--r--  string/aarch64/memset.S                  |   6
-rw-r--r--  string/aarch64/stpcpy-mte.S              |  10
-rw-r--r--  string/aarch64/stpcpy-sve.S              |   2
-rw-r--r--  string/aarch64/stpcpy.S                  |   2
-rw-r--r--  string/aarch64/strchr-mte.S              |  58
-rw-r--r--  string/aarch64/strchr-sve.S              |   6
-rw-r--r--  string/aarch64/strchr.S                  |   6
-rw-r--r--  string/aarch64/strchrnul-mte.S           |  47
-rw-r--r--  string/aarch64/strchrnul-sve.S           |   2
-rw-r--r--  string/aarch64/strchrnul.S               |   6
-rw-r--r--  string/aarch64/strcmp-mte.S              | 189
-rw-r--r--  string/aarch64/strcmp-sve.S              |   6
-rw-r--r--  string/aarch64/strcmp.S                  | 238
-rw-r--r--  string/aarch64/strcpy-mte.S              | 161
-rw-r--r--  string/aarch64/strcpy-sve.S              |   6
-rw-r--r--  string/aarch64/strcpy.S                  | 395
-rw-r--r--  string/aarch64/strlen-mte.S              |  41
-rw-r--r--  string/aarch64/strlen-sve.S              |   6
-rw-r--r--  string/aarch64/strlen.S                  |  21
-rw-r--r--  string/aarch64/strncmp-mte.S             | 307
-rw-r--r--  string/aarch64/strncmp-sve.S             |   6
-rw-r--r--  string/aarch64/strncmp.S                 | 238
-rw-r--r--  string/aarch64/strnlen-sve.S             |   6
-rw-r--r--  string/aarch64/strnlen.S                 |  60
-rw-r--r--  string/aarch64/strrchr-mte.S             |  58
-rw-r--r--  string/aarch64/strrchr-sve.S             |   6
-rw-r--r--  string/aarch64/strrchr.S                 |   6
40 files changed, 1050 insertions, 1510 deletions
diff --git a/string/aarch64/__mtag_tag_region.S b/string/aarch64/__mtag_tag_region.S
index 84339f7..207e229 100644
--- a/string/aarch64/__mtag_tag_region.S
+++ b/string/aarch64/__mtag_tag_region.S
@@ -1,8 +1,8 @@
/*
* __mtag_tag_region - tag memory
*
- * Copyright (c) 2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2021-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -15,7 +15,7 @@
* The memory region may remain untagged if tagging is not enabled.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#if __ARM_FEATURE_MEMORY_TAGGING
diff --git a/string/aarch64/__mtag_tag_zero_region.S b/string/aarch64/__mtag_tag_zero_region.S
index f58364c..44b8e01 100644
--- a/string/aarch64/__mtag_tag_zero_region.S
+++ b/string/aarch64/__mtag_tag_zero_region.S
@@ -1,8 +1,8 @@
/*
* __mtag_tag_zero_region - tag memory and fill it with zero bytes
*
- * Copyright (c) 2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2021-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -15,7 +15,7 @@
* The memory region may remain untagged if tagging is not enabled.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#if __ARM_FEATURE_MEMORY_TAGGING
diff --git a/string/aarch64/asmdefs.h b/string/aarch64/asmdefs.h
new file mode 100644
index 0000000..069b146
--- /dev/null
+++ b/string/aarch64/asmdefs.h
@@ -0,0 +1,92 @@
+/*
+ * Macros for asm code. AArch64 version.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _ASMDEFS_H
+#define _ASMDEFS_H
+
+/* Branch Target Identification support. */
+#define BTI_C hint 34
+#define BTI_J hint 36
+/* Return address signing support (pac-ret). */
+#define PACIASP hint 25; .cfi_window_save
+#define AUTIASP hint 29; .cfi_window_save
+
+/* GNU_PROPERTY_AARCH64_* macros from elf.h. */
+#define FEATURE_1_AND 0xc0000000
+#define FEATURE_1_BTI 1
+#define FEATURE_1_PAC 2
+
+/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
+#define GNU_PROPERTY(type, value) \
+ .section .note.gnu.property, "a"; \
+ .p2align 3; \
+ .word 4; \
+ .word 16; \
+ .word 5; \
+ .asciz "GNU"; \
+ .word type; \
+ .word 4; \
+ .word value; \
+ .word 0; \
+ .text
+
+/* If set then the GNU Property Note section will be added to
+ mark objects to support BTI and PAC-RET. */
+#ifndef WANT_GNU_PROPERTY
+#define WANT_GNU_PROPERTY 1
+#endif
+
+#if WANT_GNU_PROPERTY
+/* Add property note with supported features to all asm files. */
+GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
+#endif
+
+#define ENTRY_ALIGN(name, alignment) \
+ .global name; \
+ .type name,%function; \
+ .align alignment; \
+ name: \
+ .cfi_startproc; \
+ BTI_C;
+
+#define ENTRY(name) ENTRY_ALIGN(name, 6)
+
+#define ENTRY_ALIAS(name) \
+ .global name; \
+ .type name,%function; \
+ name:
+
+#define END(name) \
+ .cfi_endproc; \
+ .size name, .-name;
+
+#define L(l) .L ## l
+
+#ifdef __ILP32__
+ /* Sanitize padding bits of pointer arguments as per aapcs64 */
+#define PTR_ARG(n) mov w##n, w##n
+#else
+#define PTR_ARG(n)
+#endif
+
+#ifdef __ILP32__
+ /* Sanitize padding bits of size arguments as per aapcs64 */
+#define SIZE_ARG(n) mov w##n, w##n
+#else
+#define SIZE_ARG(n)
+#endif
+
+/* Compiler supports SVE instructions */
+#ifndef HAVE_SVE
+# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5)
+# define HAVE_SVE 1
+# else
+# define HAVE_SVE 0
+# endif
+#endif
+
+#endif
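
For reference, the note emitted by the GNU_PROPERTY macro above follows the standard NT_GNU_PROPERTY_TYPE_0 layout. A minimal C sketch of the equivalent structure (struct and field names are illustrative, not part of the source):

    #include <stdint.h>

    /* Layout of the ELF note produced by GNU_PROPERTY (type, value), assuming
       the usual NT_GNU_PROPERTY_TYPE_0 format with a single 16-byte property.  */
    struct gnu_property_note {
        uint32_t namesz;    /* .word 4   - strlen ("GNU") + 1              */
        uint32_t descsz;    /* .word 16  - one property descriptor         */
        uint32_t type;      /* .word 5   - NT_GNU_PROPERTY_TYPE_0          */
        char     name[4];   /* .asciz "GNU"                                */
        uint32_t pr_type;   /* type  - e.g. FEATURE_1_AND (0xc0000000)     */
        uint32_t pr_datasz; /* .word 4                                     */
        uint32_t pr_data;   /* value - FEATURE_1_BTI | FEATURE_1_PAC       */
        uint32_t pr_pad;    /* .word 0   - pads the descriptor to 8 bytes  */
    };
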
diff --git a/string/aarch64/check-arch.S b/string/aarch64/check-arch.S
index 5a54242..131b7fa 100644
--- a/string/aarch64/check-arch.S
+++ b/string/aarch64/check-arch.S
@@ -1,8 +1,8 @@
/*
* check ARCH setting.
*
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#if !__aarch64__
@@ -10,4 +10,4 @@
#endif
/* Include for GNU property notes. */
-#include "../asmdefs.h"
+#include "asmdefs.h"
diff --git a/string/aarch64/memchr-mte.S b/string/aarch64/memchr-mte.S
index c2e967d..948c3cb 100644
--- a/string/aarch64/memchr-mte.S
+++ b/string/aarch64/memchr-mte.S
@@ -1,8 +1,8 @@
/*
* memchr - find a character in a memory zone
*
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* MTE compatible.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define srcin x0
#define chrin w1
@@ -23,25 +23,21 @@
#define synd x5
#define shift x6
#define tmp x7
-#define wtmp w7
#define vrepchr v0
#define qdata q1
#define vdata v1
#define vhas_chr v2
-#define vrepmask v3
-#define vend v4
-#define dend d4
+#define vend v3
+#define dend d3
/*
Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+ For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+ per byte. We take 4 bits of every comparison byte with shift right and narrow
+ by 4 instruction. Since the bits in the nibble mask reflect the order in
+ which things occur in the original string, counting leading zeros identifies
+ exactly which byte matched. */
ENTRY (__memchr_aarch64_mte)
PTR_ARG (0)
@@ -50,55 +46,53 @@ ENTRY (__memchr_aarch64_mte)
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src]
dup vrepchr.16b, chrin
- mov wtmp, 0xf00f
- dup vrepmask.8h, wtmp
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
lsl shift, srcin, 2
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
cbz synd, L(start_loop)
rbit synd, synd
clz synd, synd
- add result, srcin, synd, lsr 2
cmp cntin, synd, lsr 2
+ add result, srcin, synd, lsr 2
csel result, result, xzr, hi
ret
+ .p2align 3
L(start_loop):
sub tmp, src, srcin
- add tmp, tmp, 16
+ add tmp, tmp, 17
subs cntrem, cntin, tmp
- b.ls L(nomatch)
+ b.lo L(nomatch)
/* Make sure that it won't overread by a 16-byte chunk */
- add tmp, cntrem, 15
- tbnz tmp, 4, L(loop32_2)
-
+ tbz cntrem, 4, L(loop32_2)
+ sub src, src, 16
.p2align 4
L(loop32):
- ldr qdata, [src, 16]!
+ ldr qdata, [src, 32]!
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbnz synd, L(end)
L(loop32_2):
- ldr qdata, [src, 16]!
- subs cntrem, cntrem, 32
+ ldr qdata, [src, 16]
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- b.ls L(end)
+ subs cntrem, cntrem, 32
+ b.lo L(end_2)
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbz synd, L(loop32)
+L(end_2):
+ add src, src, 16
L(end):
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
+ sub cntrem, src, srcin
fmov synd, dend
- add tmp, srcin, cntin
- sub cntrem, tmp, src
+ sub cntrem, cntin, cntrem
#ifndef __AARCH64EB__
rbit synd, synd
#endif
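
The nibble-mask syndrome described in the comment above can be emulated in scalar C. This is only an illustrative sketch (helper names are not from the source): each comparison byte is 0x00 or 0xff, the shrn-by-4 step keeps four bits per byte, and on little-endian counting trailing zeros of the 64-bit mask gives four times the index of the first matching byte; the assembly uses rbit + clz because AArch64 has no count-trailing-zeros instruction.

    #include <stdint.h>

    /* Build the 64-bit nibble mask from 16 comparison bytes (each 0x00 or 0xff),
       mirroring what "shrn vend.8b, vhas_chr.8h, 4" produces.  */
    static uint64_t nibble_mask (const uint8_t cmp[16])
    {
        uint64_t mask = 0;
        for (int i = 0; i < 16; i++)
            mask |= (uint64_t) (cmp[i] & 0xf) << (4 * i);
        return mask;
    }

    /* Index of the first matching byte in the chunk; only valid when mask != 0.  */
    static int first_match_index (uint64_t mask)
    {
        return __builtin_ctzll (mask) >> 2;
    }
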
diff --git a/string/aarch64/memchr-sve.S b/string/aarch64/memchr-sve.S
index c22e659..b851cf3 100644
--- a/string/aarch64/memchr-sve.S
+++ b/string/aarch64/memchr-sve.S
@@ -1,11 +1,11 @@
/*
* memchr - find a character in a memory zone
*
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S
index 353f0d1..fe6cfe2 100644
--- a/string/aarch64/memchr.S
+++ b/string/aarch64/memchr.S
@@ -1,8 +1,8 @@
/*
* memchr - find a character in a memory zone
*
- * Copyright (c) 2014-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* Neon Available.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
/* Arguments and results. */
#define srcin x0
diff --git a/string/aarch64/memcmp-sve.S b/string/aarch64/memcmp-sve.S
index 78c5eca..d52ce45 100644
--- a/string/aarch64/memcmp-sve.S
+++ b/string/aarch64/memcmp-sve.S
@@ -1,11 +1,11 @@
/*
* memcmp - compare memory
*
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S
index 3b10266..35135e7 100644
--- a/string/aarch64/memcmp.S
+++ b/string/aarch64/memcmp.S
@@ -1,103 +1,84 @@
/* memcmp - compare memory
*
- * Copyright (c) 2013-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2013-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
-/* Parameters and result. */
-#define src1 x0
-#define src2 x1
-#define limit x2
-#define result w0
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result w0
+
+#define data1 x3
+#define data1w w3
+#define data2 x4
+#define data2w w4
+#define data3 x5
+#define data3w w5
+#define data4 x6
+#define data4w w6
+#define tmp x6
+#define src1end x7
+#define src2end x8
-/* Internal variables. */
-#define data1 x3
-#define data1w w3
-#define data1h x4
-#define data2 x5
-#define data2w w5
-#define data2h x6
-#define tmp1 x7
-#define tmp2 x8
ENTRY (__memcmp_aarch64)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
- subs limit, limit, 8
- b.lo L(less8)
-
- ldr data1, [src1], 8
- ldr data2, [src2], 8
- cmp data1, data2
- b.ne L(return)
-
- subs limit, limit, 8
- b.gt L(more16)
- ldr data1, [src1, limit]
- ldr data2, [src2, limit]
- b L(return)
-
-L(more16):
- ldr data1, [src1], 8
- ldr data2, [src2], 8
- cmp data1, data2
- bne L(return)
-
- /* Jump directly to comparing the last 16 bytes for 32 byte (or less)
- strings. */
- subs limit, limit, 16
+ cmp limit, 16
+ b.lo L(less16)
+ ldp data1, data3, [src1]
+ ldp data2, data4, [src2]
+ ccmp data1, data2, 0, ne
+ ccmp data3, data4, 0, eq
+ b.ne L(return2)
+
+ add src1end, src1, limit
+ add src2end, src2, limit
+ cmp limit, 32
b.ls L(last_bytes)
+ cmp limit, 160
+ b.hs L(loop_align)
+ sub limit, limit, 32
- /* We overlap loads between 0-32 bytes at either side of SRC1 when we
- try to align, so limit it only to strings larger than 128 bytes. */
- cmp limit, 96
- b.ls L(loop16)
-
- /* Align src1 and adjust src2 with bytes not yet done. */
- and tmp1, src1, 15
- add limit, limit, tmp1
- sub src1, src1, tmp1
- sub src2, src2, tmp1
-
- /* Loop performing 16 bytes per iteration using aligned src1.
- Limit is pre-decremented by 16 and must be larger than zero.
- Exit if <= 16 bytes left to do or if the data is not equal. */
.p2align 4
-L(loop16):
- ldp data1, data1h, [src1], 16
- ldp data2, data2h, [src2], 16
- subs limit, limit, 16
- ccmp data1, data2, 0, hi
- ccmp data1h, data2h, 0, eq
- b.eq L(loop16)
-
+L(loop32):
+ ldp data1, data3, [src1, 16]
+ ldp data2, data4, [src2, 16]
cmp data1, data2
- bne L(return)
- mov data1, data1h
- mov data2, data2h
+ ccmp data3, data4, 0, eq
+ b.ne L(return2)
+ cmp limit, 16
+ b.ls L(last_bytes)
+
+ ldp data1, data3, [src1, 32]
+ ldp data2, data4, [src2, 32]
cmp data1, data2
- bne L(return)
+ ccmp data3, data4, 0, eq
+ b.ne L(return2)
+ add src1, src1, 32
+ add src2, src2, 32
+L(last64):
+ subs limit, limit, 32
+ b.hi L(loop32)
/* Compare last 1-16 bytes using unaligned access. */
L(last_bytes):
- add src1, src1, limit
- add src2, src2, limit
- ldp data1, data1h, [src1]
- ldp data2, data2h, [src2]
- cmp data1, data2
- bne L(return)
- mov data1, data1h
- mov data2, data2h
+ ldp data1, data3, [src1end, -16]
+ ldp data2, data4, [src2end, -16]
+L(return2):
cmp data1, data2
+ csel data1, data1, data3, ne
+ csel data2, data2, data4, ne
/* Compare data bytes and set return value to 0, -1 or 1. */
L(return):
@@ -105,33 +86,105 @@ L(return):
rev data1, data1
rev data2, data2
#endif
- cmp data1, data2
-L(ret_eq):
+ cmp data1, data2
cset result, ne
cneg result, result, lo
ret
.p2align 4
- /* Compare up to 8 bytes. Limit is [-8..-1]. */
+L(less16):
+ add src1end, src1, limit
+ add src2end, src2, limit
+ tbz limit, 3, L(less8)
+ ldr data1, [src1]
+ ldr data2, [src2]
+ ldr data3, [src1end, -8]
+ ldr data4, [src2end, -8]
+ b L(return2)
+
+ .p2align 4
L(less8):
- adds limit, limit, 4
- b.lo L(less4)
- ldr data1w, [src1], 4
- ldr data2w, [src2], 4
+ tbz limit, 2, L(less4)
+ ldr data1w, [src1]
+ ldr data2w, [src2]
+ ldr data3w, [src1end, -4]
+ ldr data4w, [src2end, -4]
+ b L(return2)
+
+L(less4):
+ tbz limit, 1, L(less2)
+ ldrh data1w, [src1]
+ ldrh data2w, [src2]
cmp data1w, data2w
b.ne L(return)
- sub limit, limit, 4
-L(less4):
- adds limit, limit, 4
- beq L(ret_eq)
-L(byte_loop):
- ldrb data1w, [src1], 1
- ldrb data2w, [src2], 1
- subs limit, limit, 1
- ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
- b.eq L(byte_loop)
+L(less2):
+ mov result, 0
+ tbz limit, 0, L(return_zero)
+ ldrb data1w, [src1end, -1]
+ ldrb data2w, [src2end, -1]
sub result, data1w, data2w
+L(return_zero):
ret
-END (__memcmp_aarch64)
+L(loop_align):
+ ldp data1, data3, [src1, 16]
+ ldp data2, data4, [src2, 16]
+ cmp data1, data2
+ ccmp data3, data4, 0, eq
+ b.ne L(return2)
+
+ /* Align src2 and adjust src1, src2 and limit. */
+ and tmp, src2, 15
+ sub tmp, tmp, 16
+ sub src2, src2, tmp
+ add limit, limit, tmp
+ sub src1, src1, tmp
+ sub limit, limit, 64 + 16
+
+ .p2align 4
+L(loop64):
+ ldr q0, [src1, 16]
+ ldr q1, [src2, 16]
+ subs limit, limit, 64
+ ldr q2, [src1, 32]
+ ldr q3, [src2, 32]
+ eor v0.16b, v0.16b, v1.16b
+ eor v1.16b, v2.16b, v3.16b
+ ldr q2, [src1, 48]
+ ldr q3, [src2, 48]
+ umaxp v0.16b, v0.16b, v1.16b
+ ldr q4, [src1, 64]!
+ ldr q5, [src2, 64]!
+ eor v1.16b, v2.16b, v3.16b
+ eor v2.16b, v4.16b, v5.16b
+ umaxp v1.16b, v1.16b, v2.16b
+ umaxp v0.16b, v0.16b, v1.16b
+ umaxp v0.16b, v0.16b, v0.16b
+ fmov tmp, d0
+ ccmp tmp, 0, 0, hi
+ b.eq L(loop64)
+
+ /* If equal, process last 1-64 bytes using scalar loop. */
+ add limit, limit, 64 + 16
+ cbz tmp, L(last64)
+
+ /* Determine the 8-byte aligned offset of the first difference. */
+#ifdef __AARCH64EB__
+ rev16 tmp, tmp
+#endif
+ rev tmp, tmp
+ clz tmp, tmp
+ bic tmp, tmp, 7
+ sub tmp, tmp, 48
+ ldr data1, [src1, tmp]
+ ldr data2, [src2, tmp]
+#ifndef __AARCH64EB__
+ rev data1, data1
+ rev data2, data2
+#endif
+ mov result, 1
+ cmp data1, data2
+ cneg result, result, lo
+ ret
+END (__memcmp_aarch64)
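
The L(less16)/L(return2) paths above use overlapping unaligned loads instead of a byte loop. A scalar C sketch of the same idea for an 8-16 byte tail (the function name is illustrative): load the first and last 8 bytes of each buffer, keep the first differing pair with a conditional select, then byte-reverse so a single unsigned compare orders the result by the first differing byte.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Compare 8..16 bytes using two possibly-overlapping 8-byte loads per buffer.  */
    static int cmp_8_to_16 (const unsigned char *s1, const unsigned char *s2, size_t n)
    {
        uint64_t a1, a2, b1, b2;
        memcpy (&a1, s1, 8);
        memcpy (&a2, s2, 8);
        memcpy (&b1, s1 + n - 8, 8);
        memcpy (&b2, s2 + n - 8, 8);
        if (a1 != a2)                 /* csel: keep the first differing pair */
        {
            b1 = a1;
            b2 = a2;
        }
        if (b1 == b2)
            return 0;
        b1 = __builtin_bswap64 (b1);  /* little-endian: first byte becomes most significant */
        b2 = __builtin_bswap64 (b2);
        return b1 < b2 ? -1 : 1;
    }
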
diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S
index f97f2c3..e6527d0 100644
--- a/string/aarch64/memcpy-advsimd.S
+++ b/string/aarch64/memcpy-advsimd.S
@@ -1,8 +1,8 @@
/*
* memcpy - copy memory area
*
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
*
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define dstin x0
#define src x1
diff --git a/string/aarch64/memcpy-sve.S b/string/aarch64/memcpy-sve.S
new file mode 100644
index 0000000..e8a946d
--- /dev/null
+++ b/string/aarch64/memcpy-sve.S
@@ -0,0 +1,177 @@
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
+ *
+ */
+
+#include "asmdefs.h"
+
+#ifdef HAVE_SVE
+
+.arch armv8-a+sve
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define tmp1 x6
+#define vlen x6
+
+#define A_q q0
+#define B_q q1
+#define C_q q2
+#define D_q q3
+#define E_q q4
+#define F_q q5
+#define G_q q6
+#define H_q q7
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+ from a single entry point. It uses unaligned accesses and branchless
+ sequences to keep the code small, simple and improve performance.
+ SVE vectors are used to speedup small copies.
+
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check is negligible since it is only required for large copies.
+
+ Large copies use a software pipelined loop processing 64 bytes per iteration.
+ The source pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+ENTRY_ALIAS (__memmove_aarch64_sve)
+ENTRY (__memcpy_aarch64_sve)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+
+ cmp count, 128
+ b.hi L(copy_long)
+ cntb vlen
+ cmp count, vlen, lsl 1
+ b.hi L(copy32_128)
+
+ whilelo p0.b, xzr, count
+ whilelo p1.b, vlen, count
+ ld1b z0.b, p0/z, [src, 0, mul vl]
+ ld1b z1.b, p1/z, [src, 1, mul vl]
+ st1b z0.b, p0, [dstin, 0, mul vl]
+ st1b z1.b, p1, [dstin, 1, mul vl]
+ ret
+
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ add srcend, src, count
+ add dstend, dstin, count
+ ldp A_q, B_q, [src]
+ ldp C_q, D_q, [srcend, -32]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_q, B_q, [dstin]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_q, F_q, [src, 32]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_q, H_q, [srcend, -64]
+ stp G_q, H_q, [dstend, -64]
+L(copy96):
+ stp A_q, B_q, [dstin]
+ stp E_q, F_q, [dstin, 32]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ add srcend, src, count
+ add dstend, dstin, count
+
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cmp tmp1, count
+ b.lo L(copy_long_backwards)
+
+ /* Copy 16 bytes and then align src to 16-byte alignment. */
+ ldr D_q, [src]
+ and tmp1, src, 15
+ bic src, src, 15
+ sub dst, dstin, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_q, B_q, [src, 16]
+ str D_q, [dstin]
+ ldp C_q, D_q, [src, 48]
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(copy64_from_end)
+L(loop64):
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [src, 80]
+ stp C_q, D_q, [dst, 48]
+ ldp C_q, D_q, [src, 112]
+ add src, src, 64
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp E_q, F_q, [srcend, -64]
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [srcend, -32]
+ stp C_q, D_q, [dst, 48]
+ stp E_q, F_q, [dstend, -64]
+ stp A_q, B_q, [dstend, -32]
+ ret
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align srcend to 16-byte alignment. */
+L(copy_long_backwards):
+ cbz tmp1, L(return)
+ ldr D_q, [srcend, -16]
+ and tmp1, srcend, 15
+ bic srcend, srcend, 15
+ sub count, count, tmp1
+ ldp A_q, B_q, [srcend, -32]
+ str D_q, [dstend, -16]
+ ldp C_q, D_q, [srcend, -64]
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ str B_q, [dstend, -16]
+ str A_q, [dstend, -32]
+ ldp A_q, B_q, [srcend, -96]
+ str D_q, [dstend, -48]
+ str C_q, [dstend, -64]!
+ ldp C_q, D_q, [srcend, -128]
+ sub srcend, srcend, 64
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp E_q, F_q, [src, 32]
+ stp A_q, B_q, [dstend, -32]
+ ldp A_q, B_q, [src]
+ stp C_q, D_q, [dstend, -64]
+ stp E_q, F_q, [dstin, 32]
+ stp A_q, B_q, [dstin]
+L(return):
+ ret
+
+END (__memcpy_aarch64_sve)
+
+#endif
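
The predicated entry path of __memcpy_aarch64_sve (whilelo + ld1b/st1b) corresponds roughly to the ACLE intrinsics sketch below; the standalone helper and its name are assumptions, and it only covers counts of at most two SVE vectors. The overlap test taken before L(copy_long_backwards) is the usual unsigned trick: if (dstin - src) compares below count, the destination starts inside the source region, so the copy must run backwards.

    #include <arm_sve.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Copy up to 2*VL bytes with per-byte predication, so no tail handling is needed.  */
    static void sve_copy_small (uint8_t *dst, const uint8_t *src, size_t count)
    {
        uint64_t vl = svcntb ();                      /* cntb vlen                         */
        svbool_t p0 = svwhilelt_b8_u64 (0, count);    /* whilelo p0.b, xzr, count          */
        svbool_t p1 = svwhilelt_b8_u64 (vl, count);   /* whilelo p1.b, vlen, count         */
        svuint8_t z0 = svld1_u8 (p0, src);            /* ld1b z0.b, p0/z, [src, 0, mul vl] */
        svuint8_t z1 = svld1_u8 (p1, src + vl);       /* ld1b z1.b, p1/z, [src, 1, mul vl] */
        svst1_u8 (p0, dst, z0);                       /* st1b z0.b, p0, [dstin, 0, mul vl] */
        svst1_u8 (p1, dst + vl, z1);                  /* st1b z1.b, p1, [dstin, 1, mul vl] */
    }
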
diff --git a/string/aarch64/memcpy.S b/string/aarch64/memcpy.S
index dd254f6..7c0606e 100644
--- a/string/aarch64/memcpy.S
+++ b/string/aarch64/memcpy.S
@@ -1,8 +1,8 @@
/*
* memcpy - copy memory area
*
- * Copyright (c) 2012-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2012-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
*
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define dstin x0
#define src x1
diff --git a/string/aarch64/memrchr.S b/string/aarch64/memrchr.S
index 7b4be84..6418bdf 100644
--- a/string/aarch64/memrchr.S
+++ b/string/aarch64/memrchr.S
@@ -1,8 +1,8 @@
/*
* memrchr - find last character in a memory zone.
*
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* MTE compatible.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define srcin x0
#define chrin w1
@@ -23,7 +23,6 @@
#define synd x5
#define shift x6
#define tmp x7
-#define wtmp w7
#define end x8
#define endm1 x9
@@ -31,19 +30,16 @@
#define qdata q1
#define vdata v1
#define vhas_chr v2
-#define vrepmask v3
-#define vend v4
-#define dend d4
+#define vend v3
+#define dend d3
/*
Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+ For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+ per byte. We take 4 bits of every comparison byte with shift right and narrow
+ by 4 instruction. Since the bits in the nibble mask reflect the order in
+ which things occur in the original string, counting leading zeros identifies
+ exactly which byte matched. */
ENTRY (__memrchr_aarch64)
PTR_ARG (0)
@@ -53,12 +49,9 @@ ENTRY (__memrchr_aarch64)
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src]
dup vrepchr.16b, chrin
- mov wtmp, 0xf00f
- dup vrepmask.8h, wtmp
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
neg shift, end, lsl 2
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov synd, dend
lsl synd, synd, shift
cbz synd, L(start_loop)
@@ -69,34 +62,36 @@ ENTRY (__memrchr_aarch64)
csel result, result, xzr, hi
ret
+ nop
L(start_loop):
- sub tmp, end, src
- subs cntrem, cntin, tmp
+ subs cntrem, src, srcin
b.ls L(nomatch)
/* Make sure that it won't overread by a 16-byte chunk */
- add tmp, cntrem, 15
- tbnz tmp, 4, L(loop32_2)
+ sub cntrem, cntrem, 1
+ tbz cntrem, 4, L(loop32_2)
+ add src, src, 16
- .p2align 4
+ .p2align 5
L(loop32):
- ldr qdata, [src, -16]!
+ ldr qdata, [src, -32]!
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbnz synd, L(end)
L(loop32_2):
- ldr qdata, [src, -16]!
+ ldr qdata, [src, -16]
subs cntrem, cntrem, 32
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- b.ls L(end)
+ b.lo L(end_2)
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbz synd, L(loop32)
+L(end_2):
+ sub src, src, 16
L(end):
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov synd, dend
add tmp, src, 15
diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S
index 9fcd975..553b0fc 100644
--- a/string/aarch64/memset.S
+++ b/string/aarch64/memset.S
@@ -1,8 +1,8 @@
/*
* memset - fill memory with a constant byte
*
- * Copyright (c) 2012-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2012-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
*
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define dstin x0
#define val x1
diff --git a/string/aarch64/stpcpy-mte.S b/string/aarch64/stpcpy-mte.S
deleted file mode 100644
index f1c7119..0000000
--- a/string/aarch64/stpcpy-mte.S
+++ /dev/null
@@ -1,10 +0,0 @@
-/*
- * stpcpy - copy a string returning pointer to end.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#define BUILD_STPCPY 1
-
-#include "strcpy-mte.S"
diff --git a/string/aarch64/stpcpy-sve.S b/string/aarch64/stpcpy-sve.S
index 82dd971..5d3f14b 100644
--- a/string/aarch64/stpcpy-sve.S
+++ b/string/aarch64/stpcpy-sve.S
@@ -2,7 +2,7 @@
* stpcpy - copy a string returning pointer to end.
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#define BUILD_STPCPY 1
diff --git a/string/aarch64/stpcpy.S b/string/aarch64/stpcpy.S
index 4f62aa4..155c68d 100644
--- a/string/aarch64/stpcpy.S
+++ b/string/aarch64/stpcpy.S
@@ -2,7 +2,7 @@
* stpcpy - copy a string returning pointer to end.
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#define BUILD_STPCPY 1
diff --git a/string/aarch64/strchr-mte.S b/string/aarch64/strchr-mte.S
index dcb0e46..6ec08f7 100644
--- a/string/aarch64/strchr-mte.S
+++ b/string/aarch64/strchr-mte.S
@@ -1,8 +1,8 @@
/*
* strchr - find a character in a string
*
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* MTE compatible.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define srcin x0
#define chrin w1
@@ -19,8 +19,7 @@
#define src x2
#define tmp1 x1
-#define wtmp2 w3
-#define tmp3 x3
+#define tmp2 x3
#define vrepchr v0
#define vdata v1
@@ -28,39 +27,30 @@
#define vhas_nul v2
#define vhas_chr v3
#define vrepmask v4
-#define vrepmask2 v5
-#define vend v6
-#define dend d6
+#define vend v5
+#define dend d5
/* Core algorithm.
For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-1 are set if the relevant byte matched the
- requested character, bits 2-3 are set if the byte is NUL (or matched), and
- bits 4-7 are not used and must be zero if none of bits 0-3 are set). Odd
- bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits
- in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+ per byte. Bits 0-1 are set if the relevant byte matched the requested
+ character, bits 2-3 are set if the byte is NUL or matched. Count trailing
+ zeroes gives the position of the matching byte if it is a multiple of 4.
+ If it is not a multiple of 4, there was no match. */
ENTRY (__strchr_aarch64_mte)
PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
ld1 {vdata.16b}, [src]
- mov wtmp2, 0x3003
- dup vrepmask.8h, wtmp2
+ movi vrepmask.16b, 0x33
cmeq vhas_nul.16b, vdata.16b, 0
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- mov wtmp2, 0xf00f
- dup vrepmask2.8h, wtmp2
-
bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
- lsl tmp3, srcin, 2
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
-
+ lsl tmp2, srcin, 2
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov tmp1, dend
- lsr tmp1, tmp1, tmp3
+ lsr tmp1, tmp1, tmp2
cbz tmp1, L(loop)
rbit tmp1, tmp1
@@ -74,28 +64,34 @@ ENTRY (__strchr_aarch64_mte)
.p2align 4
L(loop):
- ldr qdata, [src, 16]!
+ ldr qdata, [src, 16]
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov tmp1, dend
+ cbnz tmp1, L(end)
+ ldr qdata, [src, 32]!
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov tmp1, dend
cbz tmp1, L(loop)
+ sub src, src, 16
+L(end):
#ifdef __AARCH64EB__
bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov tmp1, dend
#else
bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov tmp1, dend
rbit tmp1, tmp1
#endif
+ add src, src, 16
clz tmp1, tmp1
- /* Tmp1 is an even multiple of 2 if the target character was
- found first. Otherwise we've found the end of string. */
+ /* Tmp1 is a multiple of 4 if the target character was found. */
tst tmp1, 2
add result, src, tmp1, lsr 2
csel result, result, xzr, eq
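
The tail of __strchr_aarch64_mte decodes its syndrome exactly as the updated comment states. A hedged C sketch of that decode (helper name is illustrative): bits 0-1 of each nibble flag a character match and bits 2-3 flag NUL-or-match, so the match position is a multiple of 4 only when the character was found before the terminator.

    #include <stdint.h>

    /* Return a pointer to the match inside this 16-byte chunk, or NULL when the
       first event was the terminating NUL.  Assumes syndrome != 0.  */
    static const char *decode_strchr_syndrome (const char *chunk, uint64_t syndrome)
    {
        unsigned pos = __builtin_ctzll (syndrome);   /* asm: rbit + clz               */
        if (pos & 2)                                 /* only the NUL bits were set    */
            return 0;
        return chunk + (pos >> 2);                   /* byte index = bit position / 4 */
    }
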
diff --git a/string/aarch64/strchr-sve.S b/string/aarch64/strchr-sve.S
index 13ba9f4..ff07516 100644
--- a/string/aarch64/strchr-sve.S
+++ b/string/aarch64/strchr-sve.S
@@ -1,11 +1,11 @@
/*
* strchr/strchrnul - find a character in a string
*
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S
index 1063cbf..37193bd 100644
--- a/string/aarch64/strchr.S
+++ b/string/aarch64/strchr.S
@@ -1,8 +1,8 @@
/*
* strchr - find a character in a string
*
- * Copyright (c) 2014-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* Neon Available.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
/* Arguments and results. */
#define srcin x0
diff --git a/string/aarch64/strchrnul-mte.S b/string/aarch64/strchrnul-mte.S
index 1b0d0a6..543ee88 100644
--- a/string/aarch64/strchrnul-mte.S
+++ b/string/aarch64/strchrnul-mte.S
@@ -1,8 +1,8 @@
/*
* strchrnul - find a character or nul in a string
*
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* MTE compatible.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define srcin x0
#define chrin w1
@@ -20,38 +20,32 @@
#define src x2
#define tmp1 x1
#define tmp2 x3
-#define tmp2w w3
#define vrepchr v0
#define vdata v1
#define qdata q1
#define vhas_nul v2
#define vhas_chr v3
-#define vrepmask v4
-#define vend v5
-#define dend d5
+#define vend v4
+#define dend d4
-/* Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+/*
+ Core algorithm:
+ For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+ per byte. We take 4 bits of every comparison byte with shift right and narrow
+ by 4 instruction. Since the bits in the nibble mask reflect the order in
+ which things occur in the original string, counting leading zeros identifies
+ exactly which byte matched. */
ENTRY (__strchrnul_aarch64_mte)
PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
ld1 {vdata.16b}, [src]
- mov tmp2w, 0xf00f
- dup vrepmask.8h, tmp2w
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
lsl tmp2, srcin, 2
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov tmp1, dend
lsr tmp1, tmp1, tmp2 /* Mask padding bits. */
cbz tmp1, L(loop)
@@ -63,15 +57,22 @@ ENTRY (__strchrnul_aarch64_mte)
.p2align 4
L(loop):
- ldr qdata, [src, 16]!
+ ldr qdata, [src, 16]
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
+ umaxp vend.16b, vhas_chr.16b, vhas_chr.16b
+ fmov tmp1, dend
+ cbnz tmp1, L(end)
+ ldr qdata, [src, 32]!
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b
fmov tmp1, dend
cbz tmp1, L(loop)
-
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ sub src, src, 16
+L(end):
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
+ add src, src, 16
fmov tmp1, dend
#ifndef __AARCH64EB__
rbit tmp1, tmp1
diff --git a/string/aarch64/strchrnul-sve.S b/string/aarch64/strchrnul-sve.S
index 428ff1a..0005f91 100644
--- a/string/aarch64/strchrnul-sve.S
+++ b/string/aarch64/strchrnul-sve.S
@@ -2,7 +2,7 @@
* strchrnul - find a character or nul in a string
*
* Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#define BUILD_STRCHRNUL
diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S
index a4230d9..666e8d0 100644
--- a/string/aarch64/strchrnul.S
+++ b/string/aarch64/strchrnul.S
@@ -1,8 +1,8 @@
/*
* strchrnul - find a character or nul in a string
*
- * Copyright (c) 2014-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* Neon Available.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
/* Arguments and results. */
#define srcin x0
diff --git a/string/aarch64/strcmp-mte.S b/string/aarch64/strcmp-mte.S
deleted file mode 100644
index 12d1a6b..0000000
--- a/string/aarch64/strcmp-mte.S
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * strcmp - compare two strings
- *
- * Copyright (c) 2012-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64.
- * MTE compatible.
- */
-
-#include "../asmdefs.h"
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-
-#define src1 x0
-#define src2 x1
-#define result x0
-
-#define data1 x2
-#define data1w w2
-#define data2 x3
-#define data2w w3
-#define has_nul x4
-#define diff x5
-#define off1 x5
-#define syndrome x6
-#define tmp x6
-#define data3 x7
-#define zeroones x8
-#define shift x9
-#define off2 x10
-
-/* On big-endian early bytes are at MSB and on little-endian LSB.
- LS_FW means shifting towards early bytes. */
-#ifdef __AARCH64EB__
-# define LS_FW lsl
-#else
-# define LS_FW lsr
-#endif
-
-/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word.
- Since carry propagation makes 0x1 bytes before a NUL byte appear
- NUL too in big-endian, byte-reverse the data before the NUL check. */
-
-
-ENTRY (__strcmp_aarch64_mte)
- PTR_ARG (0)
- PTR_ARG (1)
- sub off2, src2, src1
- mov zeroones, REP8_01
- and tmp, src1, 7
- tst off2, 7
- b.ne L(misaligned8)
- cbnz tmp, L(mutual_align)
-
- .p2align 4
-
-L(loop_aligned):
- ldr data2, [src1, off2]
- ldr data1, [src1], 8
-L(start_realigned):
-#ifdef __AARCH64EB__
- rev tmp, data1
- sub has_nul, tmp, zeroones
- orr tmp, tmp, REP8_7f
-#else
- sub has_nul, data1, zeroones
- orr tmp, data1, REP8_7f
-#endif
- bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */
- ccmp data1, data2, 0, eq
- b.eq L(loop_aligned)
-#ifdef __AARCH64EB__
- rev has_nul, has_nul
-#endif
- eor diff, data1, data2
- orr syndrome, diff, has_nul
-L(end):
-#ifndef __AARCH64EB__
- rev syndrome, syndrome
- rev data1, data1
- rev data2, data2
-#endif
- clz shift, syndrome
- /* The most-significant-non-zero bit of the syndrome marks either the
- first bit that is different, or the top bit of the first zero byte.
- Shifting left now will bring the critical information into the
- top bits. */
- lsl data1, data1, shift
- lsl data2, data2, shift
- /* But we need to zero-extend (char is unsigned) the value and then
- perform a signed 32-bit subtraction. */
- lsr data1, data1, 56
- sub result, data1, data2, lsr 56
- ret
-
- .p2align 4
-
-L(mutual_align):
- /* Sources are mutually aligned, but are not currently at an
- alignment boundary. Round down the addresses and then mask off
- the bytes that precede the start point. */
- bic src1, src1, 7
- ldr data2, [src1, off2]
- ldr data1, [src1], 8
- neg shift, src2, lsl 3 /* Bits to alignment -64. */
- mov tmp, -1
- LS_FW tmp, tmp, shift
- orr data1, data1, tmp
- orr data2, data2, tmp
- b L(start_realigned)
-
-L(misaligned8):
- /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
- checking to make sure that we don't access beyond the end of SRC2. */
- cbz tmp, L(src1_aligned)
-L(do_misaligned):
- ldrb data1w, [src1], 1
- ldrb data2w, [src2], 1
- cmp data1w, 0
- ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
- b.ne L(done)
- tst src1, 7
- b.ne L(do_misaligned)
-
-L(src1_aligned):
- neg shift, src2, lsl 3
- bic src2, src2, 7
- ldr data3, [src2], 8
-#ifdef __AARCH64EB__
- rev data3, data3
-#endif
- lsr tmp, zeroones, shift
- orr data3, data3, tmp
- sub has_nul, data3, zeroones
- orr tmp, data3, REP8_7f
- bics has_nul, has_nul, tmp
- b.ne L(tail)
-
- sub off1, src2, src1
-
- .p2align 4
-
-L(loop_unaligned):
- ldr data3, [src1, off1]
- ldr data2, [src1, off2]
-#ifdef __AARCH64EB__
- rev data3, data3
-#endif
- sub has_nul, data3, zeroones
- orr tmp, data3, REP8_7f
- ldr data1, [src1], 8
- bics has_nul, has_nul, tmp
- ccmp data1, data2, 0, eq
- b.eq L(loop_unaligned)
-
- lsl tmp, has_nul, shift
-#ifdef __AARCH64EB__
- rev tmp, tmp
-#endif
- eor diff, data1, data2
- orr syndrome, diff, tmp
- cbnz syndrome, L(end)
-L(tail):
- ldr data1, [src1]
- neg shift, shift
- lsr data2, data3, shift
- lsr has_nul, has_nul, shift
-#ifdef __AARCH64EB__
- rev data2, data2
- rev has_nul, has_nul
-#endif
- eor diff, data1, data2
- orr syndrome, diff, has_nul
- b L(end)
-
-L(done):
- sub result, data1, data2
- ret
-
-END (__strcmp_aarch64_mte)
-
diff --git a/string/aarch64/strcmp-sve.S b/string/aarch64/strcmp-sve.S
index e6d2da5..eaf909a 100644
--- a/string/aarch64/strcmp-sve.S
+++ b/string/aarch64/strcmp-sve.S
@@ -1,11 +1,11 @@
/*
* __strcmp_aarch64_sve - compare two strings
*
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S
index 7714ebf..137a9aa 100644
--- a/string/aarch64/strcmp.S
+++ b/string/aarch64/strcmp.S
@@ -1,168 +1,184 @@
/*
* strcmp - compare two strings
*
- * Copyright (c) 2012-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2012-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
+
/* Assumptions:
*
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64.
+ * MTE compatible.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
-/* Parameters and result. */
#define src1 x0
#define src2 x1
#define result x0
-/* Internal variables. */
#define data1 x2
#define data1w w2
#define data2 x3
#define data2w w3
#define has_nul x4
#define diff x5
+#define off1 x5
#define syndrome x6
-#define tmp1 x7
-#define tmp2 x8
-#define tmp3 x9
-#define zeroones x10
-#define pos x11
+#define tmp x6
+#define data3 x7
+#define zeroones x8
+#define shift x9
+#define off2 x10
+
+/* On big-endian early bytes are at MSB and on little-endian LSB.
+ LS_FW means shifting towards early bytes. */
+#ifdef __AARCH64EB__
+# define LS_FW lsl
+#else
+# define LS_FW lsr
+#endif
+
+/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word.
+ Since carry propagation makes 0x1 bytes before a NUL byte appear
+ NUL too in big-endian, byte-reverse the data before the NUL check. */
+
- /* Start of performance-critical section -- one 64B cache line. */
ENTRY (__strcmp_aarch64)
PTR_ARG (0)
PTR_ARG (1)
- eor tmp1, src1, src2
- mov zeroones, #REP8_01
- tst tmp1, #7
+ sub off2, src2, src1
+ mov zeroones, REP8_01
+ and tmp, src1, 7
+ tst off2, 7
b.ne L(misaligned8)
- ands tmp1, src1, #7
- b.ne L(mutual_align)
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. */
+ cbnz tmp, L(mutual_align)
+
+ .p2align 4
+
L(loop_aligned):
- ldr data1, [src1], #8
- ldr data2, [src2], #8
+ ldr data2, [src1, off2]
+ ldr data1, [src1], 8
L(start_realigned):
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+#ifdef __AARCH64EB__
+ rev tmp, data1
+ sub has_nul, tmp, zeroones
+ orr tmp, tmp, REP8_7f
+#else
+ sub has_nul, data1, zeroones
+ orr tmp, data1, REP8_7f
+#endif
+ bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */
+ ccmp data1, data2, 0, eq
+ b.eq L(loop_aligned)
+#ifdef __AARCH64EB__
+ rev has_nul, has_nul
+#endif
+ eor diff, data1, data2
orr syndrome, diff, has_nul
- cbz syndrome, L(loop_aligned)
- /* End of performance-critical section -- one 64B cache line. */
-
L(end):
-#ifndef __AARCH64EB__
+#ifndef __AARCH64EB__
rev syndrome, syndrome
rev data1, data1
- /* The MS-non-zero bit of the syndrome marks either the first bit
- that is different, or the top bit of the first zero byte.
- Shifting left now will bring the critical information into the
- top bits. */
- clz pos, syndrome
rev data2, data2
- lsl data1, data1, pos
- lsl data2, data2, pos
- /* But we need to zero-extend (char is unsigned) the value and then
- perform a signed 32-bit subtraction. */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
- ret
-#else
- /* For big-endian we cannot use the trick with the syndrome value
- as carry-propagation can corrupt the upper bits if the trailing
- bytes in the string contain 0x01. */
- /* However, if there is no NUL byte in the dword, we can generate
- the result directly. We can't just subtract the bytes as the
- MSB might be significant. */
- cbnz has_nul, 1f
- cmp data1, data2
- cset result, ne
- cneg result, result, lo
- ret
-1:
- /* Re-compute the NUL-byte detection, using a byte-reversed value. */
- rev tmp3, data1
- sub tmp1, tmp3, zeroones
- orr tmp2, tmp3, #REP8_7f
- bic has_nul, tmp1, tmp2
- rev has_nul, has_nul
- orr syndrome, diff, has_nul
- clz pos, syndrome
- /* The MS-non-zero bit of the syndrome marks either the first bit
- that is different, or the top bit of the first zero byte.
+#endif
+ clz shift, syndrome
+ /* The most-significant-non-zero bit of the syndrome marks either the
+ first bit that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
- lsl data1, data1, pos
- lsl data2, data2, pos
+ lsl data1, data1, shift
+ lsl data2, data2, shift
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
+ lsr data1, data1, 56
+ sub result, data1, data2, lsr 56
ret
-#endif
+
+ .p2align 4
L(mutual_align):
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
- the bytes that preceed the start point. */
- bic src1, src1, #7
- bic src2, src2, #7
- lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
- ldr data1, [src1], #8
- neg tmp1, tmp1 /* Bits to alignment -64. */
- ldr data2, [src2], #8
- mov tmp2, #~0
-#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
-#else
- /* Little-endian. Early bytes are at LSB. */
- lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
-#endif
- orr data1, data1, tmp2
- orr data2, data2, tmp2
+ the bytes that precede the start point. */
+ bic src1, src1, 7
+ ldr data2, [src1, off2]
+ ldr data1, [src1], 8
+ neg shift, src2, lsl 3 /* Bits to alignment -64. */
+ mov tmp, -1
+ LS_FW tmp, tmp, shift
+ orr data1, data1, tmp
+ orr data2, data2, tmp
b L(start_realigned)
L(misaligned8):
/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
- checking to make sure that we don't access beyond page boundary in
- SRC2. */
- tst src1, #7
- b.eq L(loop_misaligned)
+ checking to make sure that we don't access beyond the end of SRC2. */
+ cbz tmp, L(src1_aligned)
L(do_misaligned):
- ldrb data1w, [src1], #1
- ldrb data2w, [src2], #1
- cmp data1w, #1
- ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ ldrb data1w, [src1], 1
+ ldrb data2w, [src2], 1
+ cmp data1w, 0
+ ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
b.ne L(done)
- tst src1, #7
+ tst src1, 7
b.ne L(do_misaligned)
-L(loop_misaligned):
- /* Test if we are within the last dword of the end of a 4K page. If
- yes then jump back to the misaligned loop to copy a byte at a time. */
- and tmp1, src2, #0xff8
- eor tmp1, tmp1, #0xff8
- cbz tmp1, L(do_misaligned)
- ldr data1, [src1], #8
- ldr data2, [src2], #8
-
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+L(src1_aligned):
+ neg shift, src2, lsl 3
+ bic src2, src2, 7
+ ldr data3, [src2], 8
+#ifdef __AARCH64EB__
+ rev data3, data3
+#endif
+ lsr tmp, zeroones, shift
+ orr data3, data3, tmp
+ sub has_nul, data3, zeroones
+ orr tmp, data3, REP8_7f
+ bics has_nul, has_nul, tmp
+ b.ne L(tail)
+
+ sub off1, src2, src1
+
+ .p2align 4
+
+L(loop_unaligned):
+ ldr data3, [src1, off1]
+ ldr data2, [src1, off2]
+#ifdef __AARCH64EB__
+ rev data3, data3
+#endif
+ sub has_nul, data3, zeroones
+ orr tmp, data3, REP8_7f
+ ldr data1, [src1], 8
+ bics has_nul, has_nul, tmp
+ ccmp data1, data2, 0, eq
+ b.eq L(loop_unaligned)
+
+ lsl tmp, has_nul, shift
+#ifdef __AARCH64EB__
+ rev tmp, tmp
+#endif
+ eor diff, data1, data2
+ orr syndrome, diff, tmp
+ cbnz syndrome, L(end)
+L(tail):
+ ldr data1, [src1]
+ neg shift, shift
+ lsr data2, data3, shift
+ lsr has_nul, has_nul, shift
+#ifdef __AARCH64EB__
+ rev data2, data2
+ rev has_nul, has_nul
+#endif
+ eor diff, data1, data2
orr syndrome, diff, has_nul
- cbz syndrome, L(loop_misaligned)
b L(end)
L(done):
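
The NUL-detection identity quoted in the strcmp header, (X - 1) & ~X & 0x80 per byte (rewritten as (X - 1) & ~(X | 0x7f)), can be checked with a short C sketch; the helper name is not from the source. The result is then OR-ed with data1 ^ data2 so that one test covers both "strings differ" and "terminator reached".

    #include <stdint.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

    /* Non-zero iff any byte of x is zero; per-byte form of (X - 1) & ~(X | 0x7f).
       This mirrors the "sub ... ; orr ... ; bics ..." sequence in L(loop_aligned).  */
    static uint64_t has_zero_byte (uint64_t x)
    {
        return (x - REP8_01) & ~(x | REP8_7f);
    }
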
diff --git a/string/aarch64/strcpy-mte.S b/string/aarch64/strcpy-mte.S
deleted file mode 100644
index 88c222d..0000000
--- a/string/aarch64/strcpy-mte.S
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * strcpy/stpcpy - copy a string returning pointer to start/end.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, Advanced SIMD.
- * MTE compatible.
- */
-
-#include "../asmdefs.h"
-
-#define dstin x0
-#define srcin x1
-#define result x0
-
-#define src x2
-#define dst x3
-#define len x4
-#define synd x4
-#define tmp x5
-#define wtmp w5
-#define shift x5
-#define data1 x6
-#define dataw1 w6
-#define data2 x7
-#define dataw2 w7
-
-#define dataq q0
-#define vdata v0
-#define vhas_nul v1
-#define vrepmask v2
-#define vend v3
-#define dend d3
-#define dataq2 q1
-
-#ifdef BUILD_STPCPY
-# define STRCPY __stpcpy_aarch64_mte
-# define IFSTPCPY(X,...) X,__VA_ARGS__
-#else
-# define STRCPY __strcpy_aarch64_mte
-# define IFSTPCPY(X,...)
-#endif
-
-/* Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
-
-ENTRY (STRCPY)
- PTR_ARG (0)
- PTR_ARG (1)
- bic src, srcin, 15
- mov wtmp, 0xf00f
- ld1 {vdata.16b}, [src]
- dup vrepmask.8h, wtmp
- cmeq vhas_nul.16b, vdata.16b, 0
- lsl shift, srcin, 2
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b
- fmov synd, dend
- lsr synd, synd, shift
- cbnz synd, L(tail)
-
- ldr dataq, [src, 16]!
- cmeq vhas_nul.16b, vdata.16b, 0
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b
- fmov synd, dend
- cbz synd, L(start_loop)
-
-#ifndef __AARCH64EB__
- rbit synd, synd
-#endif
- sub tmp, src, srcin
- clz len, synd
- add len, tmp, len, lsr 2
- tbz len, 4, L(less16)
- sub tmp, len, 15
- ldr dataq, [srcin]
- ldr dataq2, [srcin, tmp]
- str dataq, [dstin]
- str dataq2, [dstin, tmp]
- IFSTPCPY (add result, dstin, len)
- ret
-
- .p2align 4,,8
-L(tail):
- rbit synd, synd
- clz len, synd
- lsr len, len, 2
-
- .p2align 4
-L(less16):
- tbz len, 3, L(less8)
- sub tmp, len, 7
- ldr data1, [srcin]
- ldr data2, [srcin, tmp]
- str data1, [dstin]
- str data2, [dstin, tmp]
- IFSTPCPY (add result, dstin, len)
- ret
-
- .p2align 4
-L(less8):
- subs tmp, len, 3
- b.lo L(less4)
- ldr dataw1, [srcin]
- ldr dataw2, [srcin, tmp]
- str dataw1, [dstin]
- str dataw2, [dstin, tmp]
- IFSTPCPY (add result, dstin, len)
- ret
-
-L(less4):
- cbz len, L(zerobyte)
- ldrh dataw1, [srcin]
- strh dataw1, [dstin]
-L(zerobyte):
- strb wzr, [dstin, len]
- IFSTPCPY (add result, dstin, len)
- ret
-
- .p2align 4
-L(start_loop):
- sub len, src, srcin
- ldr dataq2, [srcin]
- add dst, dstin, len
- str dataq2, [dstin]
-
- .p2align 5
-L(loop):
- str dataq, [dst], 16
- ldr dataq, [src, 16]!
- cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
- fmov synd, dend
- cbz synd, L(loop)
-
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
- fmov synd, dend
-#ifndef __AARCH64EB__
- rbit synd, synd
-#endif
- clz len, synd
- lsr len, len, 2
- sub tmp, len, 15
- ldr dataq, [src, tmp]
- str dataq, [dst, tmp]
- IFSTPCPY (add result, dst, len)
- ret
-
-END (STRCPY)
diff --git a/string/aarch64/strcpy-sve.S b/string/aarch64/strcpy-sve.S
index f515462..00e72dc 100644
--- a/string/aarch64/strcpy-sve.S
+++ b/string/aarch64/strcpy-sve.S
@@ -1,11 +1,11 @@
/*
* strcpy/stpcpy - copy a string returning pointer to start/end.
*
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S
index 6e9ed42..97ae37e 100644
--- a/string/aarch64/strcpy.S
+++ b/string/aarch64/strcpy.S
@@ -1,311 +1,156 @@
/*
* strcpy/stpcpy - copy a string returning pointer to start/end.
*
- * Copyright (c) 2013-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
-/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
-
- To test the page crossing code path more thoroughly, compile with
- -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
- entry path. This option is not intended for production use. */
-
-/* Arguments and results. */
#define dstin x0
#define srcin x1
+#define result x0
-/* Locals and temporaries. */
#define src x2
#define dst x3
-#define data1 x4
-#define data1w w4
-#define data2 x5
-#define data2w w5
-#define has_nul1 x6
-#define has_nul2 x7
-#define tmp1 x8
-#define tmp2 x9
-#define tmp3 x10
-#define tmp4 x11
-#define zeroones x12
-#define data1a x13
-#define data2a x14
-#define pos x15
-#define len x16
-#define to_align x17
+#define len x4
+#define synd x4
+#define tmp x5
+#define shift x5
+#define data1 x6
+#define dataw1 w6
+#define data2 x7
+#define dataw2 w7
+
+#define dataq q0
+#define vdata v0
+#define vhas_nul v1
+#define vend v2
+#define dend d2
+#define dataq2 q1
#ifdef BUILD_STPCPY
-#define STRCPY __stpcpy_aarch64
-#else
-#define STRCPY __strcpy_aarch64
-#endif
-
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. */
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
-
- /* AArch64 systems have a minimum page size of 4k. We can do a quick
- page size check for crossing this boundary on entry and if we
- do not, then we can short-circuit much of the entry code. We
- expect early page-crossing strings to be rare (probability of
- 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
- predictable, even with random strings.
-
- We don't bother checking for larger page sizes, the cost of setting
- up the correct page size is just not worth the extra gain from
- a small reduction in the cases taking the slow path. Note that
- we only care about whether the first fetch, which may be
- misaligned, crosses a page boundary - after that we move to aligned
- fetches for the remainder of the string. */
-
-#ifdef STRCPY_TEST_PAGE_CROSS
- /* Make everything that isn't Qword aligned look like a page cross. */
-#define MIN_PAGE_P2 4
+# define STRCPY __stpcpy_aarch64
+# define IFSTPCPY(X,...) X,__VA_ARGS__
#else
-#define MIN_PAGE_P2 12
+# define STRCPY __strcpy_aarch64
+# define IFSTPCPY(X,...)
#endif
-#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
+/*
+ Core algorithm:
+ For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+ per byte. We take 4 bits of every comparison byte with shift right and narrow
+ by 4 instruction. Since the bits in the nibble mask reflect the order in
+ which things occur in the original string, counting leading zeros identifies
+ exactly which byte matched. */
ENTRY (STRCPY)
PTR_ARG (0)
PTR_ARG (1)
- /* For moderately short strings, the fastest way to do the copy is to
- calculate the length of the string in the same way as strlen, then
- essentially do a memcpy of the result. This avoids the need for
- multiple byte copies and further means that by the time we
- reach the bulk copy loop we know we can always use DWord
- accesses. We expect __strcpy_aarch64 to rarely be called repeatedly
- with the same source string, so branch prediction is likely to
- always be difficult - we mitigate against this by preferring
- conditional select operations over branches whenever this is
- feasible. */
- and tmp2, srcin, #(MIN_PAGE_SIZE - 1)
- mov zeroones, #REP8_01
- and to_align, srcin, #15
- cmp tmp2, #(MIN_PAGE_SIZE - 16)
- neg tmp1, to_align
- /* The first fetch will straddle a (possible) page boundary iff
- srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
- aligned string will never fail the page align check, so will
- always take the fast path. */
- b.gt L(page_cross)
-
-L(page_cross_ok):
- ldp data1, data2, [srcin]
-#ifdef __AARCH64EB__
- /* Because we expect the end to be found within 16 characters
- (profiling shows this is the most common case), it's worth
- swapping the bytes now to save having to recalculate the
- termination syndrome later. We preserve data1 and data2
- so that we can re-use the values later on. */
- rev tmp2, data1
- sub tmp1, tmp2, zeroones
- orr tmp2, tmp2, #REP8_7f
- bics has_nul1, tmp1, tmp2
- b.ne L(fp_le8)
- rev tmp4, data2
- sub tmp3, tmp4, zeroones
- orr tmp4, tmp4, #REP8_7f
-#else
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- bics has_nul1, tmp1, tmp2
- b.ne L(fp_le8)
- sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
+ bic src, srcin, 15
+ ld1 {vdata.16b}, [src]
+ cmeq vhas_nul.16b, vdata.16b, 0
+ lsl shift, srcin, 2
+ shrn vend.8b, vhas_nul.8h, 4
+ fmov synd, dend
+ lsr synd, synd, shift
+ cbnz synd, L(tail)
+
+ ldr dataq, [src, 16]!
+ cmeq vhas_nul.16b, vdata.16b, 0
+ shrn vend.8b, vhas_nul.8h, 4
+ fmov synd, dend
+ cbz synd, L(start_loop)
+
+#ifndef __AARCH64EB__
+ rbit synd, synd
#endif
- bics has_nul2, tmp3, tmp4
- b.eq L(bulk_entry)
+ sub tmp, src, srcin
+ clz len, synd
+ add len, tmp, len, lsr 2
+ tbz len, 4, L(less16)
+ sub tmp, len, 15
+ ldr dataq, [srcin]
+ ldr dataq2, [srcin, tmp]
+ str dataq, [dstin]
+ str dataq2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
+ ret
- /* The string is short (<=16 bytes). We don't know exactly how
- short though, yet. Work out the exact length so that we can
- quickly select the optimal copy strategy. */
-L(fp_gt8):
- rev has_nul2, has_nul2
- clz pos, has_nul2
- mov tmp2, #56
- add dst, dstin, pos, lsr #3 /* Bits to bytes. */
- sub pos, tmp2, pos
-#ifdef __AARCH64EB__
- lsr data2, data2, pos
-#else
- lsl data2, data2, pos
-#endif
- str data2, [dst, #1]
+L(tail):
+ rbit synd, synd
+ clz len, synd
+ lsr len, len, 2
+L(less16):
+ tbz len, 3, L(less8)
+ sub tmp, len, 7
+ ldr data1, [srcin]
+ ldr data2, [srcin, tmp]
str data1, [dstin]
-#ifdef BUILD_STPCPY
- add dstin, dst, #8
-#endif
+ str data2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
ret
-L(fp_le8):
- rev has_nul1, has_nul1
- clz pos, has_nul1
- add dst, dstin, pos, lsr #3 /* Bits to bytes. */
- subs tmp2, pos, #24 /* Pos in bits. */
- b.lt L(fp_lt4)
-#ifdef __AARCH64EB__
- mov tmp2, #56
- sub pos, tmp2, pos
- lsr data2, data1, pos
- lsr data1, data1, #32
-#else
- lsr data2, data1, tmp2
-#endif
- /* 4->7 bytes to copy. */
- str data2w, [dst, #-3]
- str data1w, [dstin]
-#ifdef BUILD_STPCPY
- mov dstin, dst
-#endif
+ .p2align 4
+L(less8):
+ subs tmp, len, 3
+ b.lo L(less4)
+ ldr dataw1, [srcin]
+ ldr dataw2, [srcin, tmp]
+ str dataw1, [dstin]
+ str dataw2, [dstin, tmp]
+ IFSTPCPY (add result, dstin, len)
ret
-L(fp_lt4):
- cbz pos, L(fp_lt2)
- /* 2->3 bytes to copy. */
-#ifdef __AARCH64EB__
- lsr data1, data1, #48
-#endif
- strh data1w, [dstin]
- /* Fall-through, one byte (max) to go. */
-L(fp_lt2):
- /* Null-terminated string. Last character must be zero! */
- strb wzr, [dst]
-#ifdef BUILD_STPCPY
- mov dstin, dst
-#endif
- ret
-
- .p2align 6
- /* Aligning here ensures that the entry code and main loop all lies
- within one 64-byte cache line. */
-L(bulk_entry):
- sub to_align, to_align, #16
- stp data1, data2, [dstin]
- sub src, srcin, to_align
- sub dst, dstin, to_align
- b L(entry_no_page_cross)
-
- /* The inner loop deals with two Dwords at a time. This has a
- slightly higher start-up cost, but we should win quite quickly,
- especially on cores with a high number of issue slots per
- cycle, as we get much better parallelism out of the operations. */
-L(main_loop):
- stp data1, data2, [dst], #16
-L(entry_no_page_cross):
- ldp data1, data2, [src], #16
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
- bic has_nul1, tmp1, tmp2
- bics has_nul2, tmp3, tmp4
- ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
- b.eq L(main_loop)
- /* Since we know we are copying at least 16 bytes, the fastest way
- to deal with the tail is to determine the location of the
- trailing NUL, then (re)copy the 16 bytes leading up to that. */
- cmp has_nul1, #0
-#ifdef __AARCH64EB__
- /* For big-endian, carry propagation (if the final byte in the
- string is 0x01) means we cannot use has_nul directly. The
- easiest way to get the correct byte is to byte-swap the data
- and calculate the syndrome a second time. */
- csel data1, data1, data2, ne
- rev data1, data1
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- bic has_nul1, tmp1, tmp2
-#else
- csel has_nul1, has_nul1, has_nul2, ne
-#endif
- rev has_nul1, has_nul1
- clz pos, has_nul1
- add tmp1, pos, #72
- add pos, pos, #8
- csel pos, pos, tmp1, ne
- add src, src, pos, lsr #3
- add dst, dst, pos, lsr #3
- ldp data1, data2, [src, #-32]
- stp data1, data2, [dst, #-16]
-#ifdef BUILD_STPCPY
- sub dstin, dst, #1
-#endif
+L(less4):
+ cbz len, L(zerobyte)
+ ldrh dataw1, [srcin]
+ strh dataw1, [dstin]
+L(zerobyte):
+ strb wzr, [dstin, len]
+ IFSTPCPY (add result, dstin, len)
ret
-L(page_cross):
- bic src, srcin, #15
- /* Start by loading two words at [srcin & ~15], then forcing the
- bytes that precede srcin to 0xff. This means they never look
- like termination bytes. */
- ldp data1, data2, [src]
- lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
- tst to_align, #7
- csetm tmp2, ne
-#ifdef __AARCH64EB__
- lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
-#else
- lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+ .p2align 4
+L(start_loop):
+ sub tmp, srcin, dstin
+ ldr dataq2, [srcin]
+ sub dst, src, tmp
+ str dataq2, [dstin]
+L(loop):
+ str dataq, [dst], 32
+ ldr dataq, [src, 16]
+ cmeq vhas_nul.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbnz synd, L(loopend)
+ str dataq, [dst, -16]
+ ldr dataq, [src, 32]!
+ cmeq vhas_nul.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbz synd, L(loop)
+ add dst, dst, 16
+L(loopend):
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
+ fmov synd, dend
+ sub dst, dst, 31
+#ifndef __AARCH64EB__
+ rbit synd, synd
#endif
- orr data1, data1, tmp2
- orr data2a, data2, tmp2
- cmp to_align, #8
- csinv data1, data1, xzr, lt
- csel data2, data2, data2a, lt
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
- bic has_nul1, tmp1, tmp2
- bics has_nul2, tmp3, tmp4
- ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
- b.eq L(page_cross_ok)
- /* We now need to make data1 and data2 look like they've been
- loaded directly from srcin. Do a rotate on the 128-bit value. */
- lsl tmp1, to_align, #3 /* Bytes->bits. */
- neg tmp2, to_align, lsl #3
-#ifdef __AARCH64EB__
- lsl data1a, data1, tmp1
- lsr tmp4, data2, tmp2
- lsl data2, data2, tmp1
- orr tmp4, tmp4, data1a
- cmp to_align, #8
- csel data1, tmp4, data2, lt
- rev tmp2, data1
- rev tmp4, data2
- sub tmp1, tmp2, zeroones
- orr tmp2, tmp2, #REP8_7f
- sub tmp3, tmp4, zeroones
- orr tmp4, tmp4, #REP8_7f
-#else
- lsr data1a, data1, tmp1
- lsl tmp4, data2, tmp2
- lsr data2, data2, tmp1
- orr tmp4, tmp4, data1a
- cmp to_align, #8
- csel data1, tmp4, data2, lt
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, #REP8_7f
-#endif
- bic has_nul1, tmp1, tmp2
- cbnz has_nul1, L(fp_le8)
- bic has_nul2, tmp3, tmp4
- b L(fp_gt8)
+ clz len, synd
+ lsr len, len, 2
+ add dst, dst, len
+ ldr dataq, [dst, tmp]
+ str dataq, [dst]
+ IFSTPCPY (add result, dst, 15)
+ ret
END (STRCPY)
-
diff --git a/string/aarch64/strlen-mte.S b/string/aarch64/strlen-mte.S
index 7cf41d5..7723579 100644
--- a/string/aarch64/strlen-mte.S
+++ b/string/aarch64/strlen-mte.S
@@ -1,8 +1,8 @@
/*
* strlen - calculate the length of a string.
*
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* MTE compatible.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define srcin x0
#define result x0
@@ -19,35 +19,26 @@
#define src x1
#define synd x2
#define tmp x3
-#define wtmp w3
#define shift x4
#define data q0
#define vdata v0
#define vhas_nul v1
-#define vrepmask v2
-#define vend v3
-#define dend d3
+#define vend v2
+#define dend d2
/* Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+ Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
+ four bits per byte using the shrn instruction. Counting trailing zeros then
+ identifies the first zero byte. */
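The entry sequence relies on the syndrome being shiftable: the first load comes from src & ~15, which cannot cross a 16-byte MTE granule or a page boundary, and the bytes that precede srcin are discarded by shifting the syndrome right by four bits per byte. A hedged C sketch of just that discard step (names invented; the asm lets lsr reduce srcin << 2 modulo 64):

    #include <stdint.h>

    /* Drop the syndrome bits of bytes that sit before the string start.
       4 * (srcin & 15) is what the asm's "lsl shift, srcin, 2" plus the
       mod-64 behaviour of lsr effectively computes.  */
    static uint64_t discard_before_start (uint64_t syndrome, uintptr_t srcin)
    {
        unsigned shift = 4 * (unsigned) (srcin & 15);
        return syndrome >> shift;
    }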
ENTRY (__strlen_aarch64_mte)
PTR_ARG (0)
bic src, srcin, 15
- mov wtmp, 0xf00f
ld1 {vdata.16b}, [src]
- dup vrepmask.8h, wtmp
cmeq vhas_nul.16b, vdata.16b, 0
lsl shift, srcin, 2
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
cbz synd, L(loop)
@@ -59,19 +50,25 @@ ENTRY (__strlen_aarch64_mte)
.p2align 5
L(loop):
- ldr data, [src, 16]!
+ ldr data, [src, 16]
+ cmeq vhas_nul.16b, vdata.16b, 0
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbnz synd, L(loop_end)
+ ldr data, [src, 32]!
cmeq vhas_nul.16b, vdata.16b, 0
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
cbz synd, L(loop)
-
- and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ sub src, src, 16
+L(loop_end):
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
sub result, src, srcin
fmov synd, dend
#ifndef __AARCH64EB__
rbit synd, synd
#endif
+ add result, result, 16
clz tmp, synd
add result, result, tmp, lsr 2
ret
diff --git a/string/aarch64/strlen-sve.S b/string/aarch64/strlen-sve.S
index 2392493..12ebbdb 100644
--- a/string/aarch64/strlen-sve.S
+++ b/string/aarch64/strlen-sve.S
@@ -1,11 +1,11 @@
/*
* __strlen_aarch64_sve - compute the length of a string
*
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S
index a1b164a..6f6f08f 100644
--- a/string/aarch64/strlen.S
+++ b/string/aarch64/strlen.S
@@ -1,8 +1,8 @@
/*
* strlen - calculate the length of a string.
*
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* Not MTE compatible.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define srcin x0
#define len x0
@@ -36,6 +36,7 @@
#define tmp x2
#define tmpw w2
#define synd x3
+#define syndw w3
#define shift x4
/* For the first 32 bytes, NUL detection works on the principle that
@@ -110,7 +111,6 @@ ENTRY (__strlen_aarch64)
add len, len, tmp1, lsr 3
ret
- .p2align 3
/* Look for a NUL byte at offset 16..31 in the string. */
L(bytes16_31):
ldp data1, data2, [srcin, 16]
@@ -138,6 +138,7 @@ L(bytes16_31):
add len, len, tmp1, lsr 3
ret
+ nop
L(loop_entry):
bic src, srcin, 31
@@ -153,18 +154,12 @@ L(loop):
/* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
cmeq maskv.16b, datav1.16b, 0
sub len, src, srcin
- tst synd, 0xffffffff
- b.ne 1f
+ cbnz syndw, 1f
cmeq maskv.16b, datav2.16b, 0
add len, len, 16
1:
/* Generate a bitmask and compute correct byte offset. */
-#ifdef __AARCH64EB__
- bic maskv.8h, 0xf0
-#else
- bic maskv.8h, 0x0f, lsl 8
-#endif
- umaxp maskv.16b, maskv.16b, maskv.16b
+ shrn maskv.8b, maskv.8h, 4
fmov synd, maskd
#ifndef __AARCH64EB__
rbit synd, synd
@@ -173,8 +168,6 @@ L(loop):
add len, len, tmp, lsr 2
ret
- .p2align 4
-
L(page_cross):
bic src, srcin, 31
mov tmpw, 0x0c03
diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S
deleted file mode 100644
index c9d6fc8..0000000
--- a/string/aarch64/strncmp-mte.S
+++ /dev/null
@@ -1,307 +0,0 @@
-/*
- * strncmp - compare two strings
- *
- * Copyright (c) 2013-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64
- */
-
-#include "../asmdefs.h"
-
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-
-/* Parameters and result. */
-#define src1 x0
-#define src2 x1
-#define limit x2
-#define result x0
-
-/* Internal variables. */
-#define data1 x3
-#define data1w w3
-#define data2 x4
-#define data2w w4
-#define has_nul x5
-#define diff x6
-#define syndrome x7
-#define tmp1 x8
-#define tmp2 x9
-#define tmp3 x10
-#define zeroones x11
-#define pos x12
-#define mask x13
-#define endloop x14
-#define count mask
-#define offset pos
-#define neg_offset x15
-
-/* Define endian dependent shift operations.
- On big-endian early bytes are at MSB and on little-endian LSB.
- LS_FW means shifting towards early bytes.
- LS_BK means shifting towards later bytes.
- */
-#ifdef __AARCH64EB__
-#define LS_FW lsl
-#define LS_BK lsr
-#else
-#define LS_FW lsr
-#define LS_BK lsl
-#endif
-
-ENTRY (__strncmp_aarch64_mte)
- PTR_ARG (0)
- PTR_ARG (1)
- SIZE_ARG (2)
- cbz limit, L(ret0)
- eor tmp1, src1, src2
- mov zeroones, #REP8_01
- tst tmp1, #7
- and count, src1, #7
- b.ne L(misaligned8)
- cbnz count, L(mutual_align)
-
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. */
- .p2align 4
-L(loop_aligned):
- ldr data1, [src1], #8
- ldr data2, [src2], #8
-L(start_realigned):
- subs limit, limit, #8
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- csinv endloop, diff, xzr, hi /* Last Dword or differences. */
- bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
- ccmp endloop, #0, #0, eq
- b.eq L(loop_aligned)
- /* End of main loop */
-
-L(full_check):
-#ifndef __AARCH64EB__
- orr syndrome, diff, has_nul
- add limit, limit, 8 /* Rewind limit to before last subs. */
-L(syndrome_check):
- /* Limit was reached. Check if the NUL byte or the difference
- is before the limit. */
- rev syndrome, syndrome
- rev data1, data1
- clz pos, syndrome
- rev data2, data2
- lsl data1, data1, pos
- cmp limit, pos, lsr #3
- lsl data2, data2, pos
- /* But we need to zero-extend (char is unsigned) the value and then
- perform a signed 32-bit subtraction. */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
- csel result, result, xzr, hi
- ret
-#else
- /* Not reached the limit, must have found the end or a diff. */
- tbz limit, #63, L(not_limit)
- add tmp1, limit, 8
- cbz limit, L(not_limit)
-
- lsl limit, tmp1, #3 /* Bits -> bytes. */
- mov mask, #~0
- lsr mask, mask, limit
- bic data1, data1, mask
- bic data2, data2, mask
-
- /* Make sure that the NUL byte is marked in the syndrome. */
- orr has_nul, has_nul, mask
-
-L(not_limit):
- /* For big-endian we cannot use the trick with the syndrome value
- as carry-propagation can corrupt the upper bits if the trailing
- bytes in the string contain 0x01. */
- /* However, if there is no NUL byte in the dword, we can generate
- the result directly. We can't just subtract the bytes as the
- MSB might be significant. */
- cbnz has_nul, 1f
- cmp data1, data2
- cset result, ne
- cneg result, result, lo
- ret
-1:
- /* Re-compute the NUL-byte detection, using a byte-reversed value. */
- rev tmp3, data1
- sub tmp1, tmp3, zeroones
- orr tmp2, tmp3, #REP8_7f
- bic has_nul, tmp1, tmp2
- rev has_nul, has_nul
- orr syndrome, diff, has_nul
- clz pos, syndrome
- /* The most-significant-non-zero bit of the syndrome marks either the
- first bit that is different, or the top bit of the first zero byte.
- Shifting left now will bring the critical information into the
- top bits. */
-L(end_quick):
- lsl data1, data1, pos
- lsl data2, data2, pos
- /* But we need to zero-extend (char is unsigned) the value and then
- perform a signed 32-bit subtraction. */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
- ret
-#endif
-
-L(mutual_align):
- /* Sources are mutually aligned, but are not currently at an
- alignment boundary. Round down the addresses and then mask off
- the bytes that precede the start point.
- We also need to adjust the limit calculations, but without
- overflowing if the limit is near ULONG_MAX. */
- bic src1, src1, #7
- bic src2, src2, #7
- ldr data1, [src1], #8
- neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
- ldr data2, [src2], #8
- mov tmp2, #~0
- LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
- /* Adjust the limit and ensure it doesn't overflow. */
- adds limit, limit, count
- csinv limit, limit, xzr, lo
- orr data1, data1, tmp2
- orr data2, data2, tmp2
- b L(start_realigned)
-
- .p2align 4
- /* Don't bother with dwords for up to 16 bytes. */
-L(misaligned8):
- cmp limit, #16
- b.hs L(try_misaligned_words)
-
-L(byte_loop):
- /* Perhaps we can do better than this. */
- ldrb data1w, [src1], #1
- ldrb data2w, [src2], #1
- subs limit, limit, #1
- ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
- ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq L(byte_loop)
-L(done):
- sub result, data1, data2
- ret
- /* Align the SRC1 to a dword by doing a bytewise compare and then do
- the dword loop. */
-L(try_misaligned_words):
- cbz count, L(src1_aligned)
-
- neg count, count
- and count, count, #7
- sub limit, limit, count
-
-L(page_end_loop):
- ldrb data1w, [src1], #1
- ldrb data2w, [src2], #1
- cmp data1w, #1
- ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.ne L(done)
- subs count, count, #1
- b.hi L(page_end_loop)
-
- /* The following diagram explains the comparison of misaligned strings.
- The bytes are shown in natural order. For little-endian, it is
- reversed in the registers. The "x" bytes are before the string.
- The "|" separates data that is loaded at one time.
- src1 | a a a a a a a a | b b b c c c c c | . . .
- src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
-
- After shifting in each step, the data looks like this:
- STEP_A STEP_B STEP_C
- data1 a a a a a a a a b b b c c c c c b b b c c c c c
- data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
-
- The bytes with "0" are eliminated from the syndrome via mask.
-
- Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
- time from SRC2. The comparison happens in 3 steps. After each step
- the loop can exit, or read from SRC1 or SRC2. */
-L(src1_aligned):
- /* Calculate offset from 8 byte alignment to string start in bits. No
- need to mask offset since shifts are ignoring upper bits. */
- lsl offset, src2, #3
- bic src2, src2, #0xf
- mov mask, -1
- neg neg_offset, offset
- ldr data1, [src1], #8
- ldp tmp1, tmp2, [src2], #16
- LS_BK mask, mask, neg_offset
- and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
- /* Skip the first compare if data in tmp1 is irrelevant. */
- tbnz offset, 6, L(misaligned_mid_loop)
-
-L(loop_misaligned):
- /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
- LS_FW data2, tmp1, offset
- LS_BK tmp1, tmp2, neg_offset
- subs limit, limit, #8
- orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
- sub has_nul, data1, zeroones
- eor diff, data1, data2 /* Non-zero if differences found. */
- orr tmp3, data1, #REP8_7f
- csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
- bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
- orr tmp3, endloop, has_nul
- cbnz tmp3, L(full_check)
-
- ldr data1, [src1], #8
-L(misaligned_mid_loop):
- /* STEP_B: Compare first part of data1 to second part of tmp2. */
- LS_FW data2, tmp2, offset
-#ifdef __AARCH64EB__
- /* For big-endian we do a byte reverse to avoid carry-propagation
- problem described above. This way we can reuse the has_nul in the
- next step and also use syndrome value trick at the end. */
- rev tmp3, data1
- #define data1_fixed tmp3
-#else
- #define data1_fixed data1
-#endif
- sub has_nul, data1_fixed, zeroones
- orr tmp3, data1_fixed, #REP8_7f
- eor diff, data2, data1 /* Non-zero if differences found. */
- bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
-#ifdef __AARCH64EB__
- rev has_nul, has_nul
-#endif
- cmp limit, neg_offset, lsr #3
- orr syndrome, diff, has_nul
- bic syndrome, syndrome, mask /* Ignore later bytes. */
- csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
- cbnz tmp3, L(syndrome_check)
-
- /* STEP_C: Compare second part of data1 to first part of tmp1. */
- ldp tmp1, tmp2, [src2], #16
- cmp limit, #8
- LS_BK data2, tmp1, neg_offset
- eor diff, data2, data1 /* Non-zero if differences found. */
- orr syndrome, diff, has_nul
- and syndrome, syndrome, mask /* Ignore earlier bytes. */
- csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
- cbnz tmp3, L(syndrome_check)
-
- ldr data1, [src1], #8
- sub limit, limit, #8
- b L(loop_misaligned)
-
-#ifdef __AARCH64EB__
-L(syndrome_check):
- clz pos, syndrome
- cmp pos, limit, lsl #3
- b.lo L(end_quick)
-#endif
-
-L(ret0):
- mov result, #0
- ret
-END(__strncmp_aarch64_mte)
-
diff --git a/string/aarch64/strncmp-sve.S b/string/aarch64/strncmp-sve.S
index 234190e..6a9e9f7 100644
--- a/string/aarch64/strncmp-sve.S
+++ b/string/aarch64/strncmp-sve.S
@@ -1,11 +1,11 @@
/*
* strncmp - compare two strings with limit
*
- * Copyright (c) 2018-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S
index 738b653..128a10c 100644
--- a/string/aarch64/strncmp.S
+++ b/string/aarch64/strncmp.S
@@ -1,20 +1,20 @@
/*
* strncmp - compare two strings
*
- * Copyright (c) 2013-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2013-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
*
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64.
+ * MTE compatible.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
/* Parameters and result. */
#define src1 x0
@@ -35,10 +35,24 @@
#define tmp3 x10
#define zeroones x11
#define pos x12
-#define limit_wd x13
-#define mask x14
-#define endloop x15
+#define mask x13
+#define endloop x14
#define count mask
+#define offset pos
+#define neg_offset x15
+
+/* Define endian dependent shift operations.
+ On big-endian early bytes are at MSB and on little-endian LSB.
+ LS_FW means shifting towards early bytes.
+ LS_BK means shifting towards later bytes.
+ */
+#ifdef __AARCH64EB__
+#define LS_FW lsl
+#define LS_BK lsr
+#else
+#define LS_FW lsr
+#define LS_BK lsl
+#endif
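A hedged C analogue of these macros (illustrative only; the hardware lsl/lsr reduce the shift amount modulo 64, which the C version must do explicitly to stay well-defined):

    #include <stdint.h>

    /* "Forward" means towards the earlier bytes of the string in memory:
       on little-endian byte 0 is the least significant, so forward is a
       right shift; on big-endian it is a left shift.  */
    #ifdef __AARCH64EB__
    # define LS_FW(x, bits) ((uint64_t) (x) << ((bits) & 63))
    # define LS_BK(x, bits) ((uint64_t) (x) >> ((bits) & 63))
    #else
    # define LS_FW(x, bits) ((uint64_t) (x) >> ((bits) & 63))
    # define LS_BK(x, bits) ((uint64_t) (x) << ((bits) & 63))
    #endif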
ENTRY (__strncmp_aarch64)
PTR_ARG (0)
@@ -51,9 +65,6 @@ ENTRY (__strncmp_aarch64)
and count, src1, #7
b.ne L(misaligned8)
cbnz count, L(mutual_align)
- /* Calculate the number of full and partial words -1. */
- sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
- lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
@@ -63,56 +74,52 @@ L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
L(start_realigned):
- subs limit_wd, limit_wd, #1
+ subs limit, limit, #8
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
eor diff, data1, data2 /* Non-zero if differences found. */
- csinv endloop, diff, xzr, pl /* Last Dword or differences. */
+ csinv endloop, diff, xzr, hi /* Last Dword or differences. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp endloop, #0, #0, eq
b.eq L(loop_aligned)
/* End of main loop */
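The loop's zero-byte test is the bit trick quoted in the comment before it; a minimal C restatement, using the constant names from this file:

    #include <stdint.h>
    #include <stdbool.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

    /* Non-zero iff some byte of x is zero.  Borrow propagation (see the
       big-endian comments in this file) means the set bits may not pinpoint
       the first zero byte, but the existence test itself is exact.  */
    static bool has_zero_byte (uint64_t x)
    {
        return ((x - REP8_01) & ~(x | REP8_7f)) != 0;
    }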
- /* Not reached the limit, must have found the end or a diff. */
- tbz limit_wd, #63, L(not_limit)
-
- /* Limit % 8 == 0 => all bytes significant. */
- ands limit, limit, #7
- b.eq L(not_limit)
-
- lsl limit, limit, #3 /* Bits -> bytes. */
- mov mask, #~0
-#ifdef __AARCH64EB__
- lsr mask, mask, limit
-#else
- lsl mask, mask, limit
-#endif
- bic data1, data1, mask
- bic data2, data2, mask
-
- /* Make sure that the NUL byte is marked in the syndrome. */
- orr has_nul, has_nul, mask
-
-L(not_limit):
+L(full_check):
+#ifndef __AARCH64EB__
orr syndrome, diff, has_nul
-
-#ifndef __AARCH64EB__
+ add limit, limit, 8 /* Rewind limit to before last subs. */
+L(syndrome_check):
+ /* Limit was reached. Check if the NUL byte or the difference
+ is before the limit. */
rev syndrome, syndrome
rev data1, data1
- /* The MS-non-zero bit of the syndrome marks either the first bit
- that is different, or the top bit of the first zero byte.
- Shifting left now will bring the critical information into the
- top bits. */
clz pos, syndrome
rev data2, data2
lsl data1, data1, pos
+ cmp limit, pos, lsr #3
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, #56
sub result, data1, data2, lsr #56
+ csel result, result, xzr, hi
ret
#else
+ /* Not reached the limit, must have found the end or a diff. */
+ tbz limit, #63, L(not_limit)
+ add tmp1, limit, 8
+ cbz limit, L(not_limit)
+
+ lsl limit, tmp1, #3 /* Bits -> bytes. */
+ mov mask, #~0
+ lsr mask, mask, limit
+ bic data1, data1, mask
+ bic data2, data2, mask
+
+ /* Make sure that the NUL byte is marked in the syndrome. */
+ orr has_nul, has_nul, mask
+
+L(not_limit):
/* For big-endian we cannot use the trick with the syndrome value
as carry-propagation can corrupt the upper bits if the trailing
bytes in the string contain 0x01. */
@@ -133,10 +140,11 @@ L(not_limit):
rev has_nul, has_nul
orr syndrome, diff, has_nul
clz pos, syndrome
- /* The MS-non-zero bit of the syndrome marks either the first bit
- that is different, or the top bit of the first zero byte.
+ /* The most-significant-non-zero bit of the syndrome marks either the
+ first bit that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
+L(end_quick):
lsl data1, data1, pos
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
@@ -158,22 +166,12 @@ L(mutual_align):
neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
ldr data2, [src2], #8
mov tmp2, #~0
- sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
-#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
-#else
- /* Little-endian. Early bytes are at LSB. */
- lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
-#endif
- and tmp3, limit_wd, #7
- lsr limit_wd, limit_wd, #3
- /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
- add limit, limit, count
- add tmp3, tmp3, count
+ LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
+ /* Adjust the limit and ensure it doesn't overflow. */
+ adds limit, limit, count
+ csinv limit, limit, xzr, lo
orr data1, data1, tmp2
orr data2, data2, tmp2
- add limit_wd, limit_wd, tmp3, lsr #3
b L(start_realigned)
.p2align 4
@@ -196,13 +194,11 @@ L(done):
/* Align the SRC1 to a dword by doing a bytewise compare and then do
the dword loop. */
L(try_misaligned_words):
- lsr limit_wd, limit, #3
- cbz count, L(do_misaligned)
+ cbz count, L(src1_aligned)
neg count, count
and count, count, #7
sub limit, limit, count
- lsr limit_wd, limit, #3
L(page_end_loop):
ldrb data1w, [src1], #1
@@ -213,48 +209,100 @@ L(page_end_loop):
subs count, count, #1
b.hi L(page_end_loop)
-L(do_misaligned):
- /* Prepare ourselves for the next page crossing. Unlike the aligned
- loop, we fetch 1 less dword because we risk crossing bounds on
- SRC2. */
- mov count, #8
- subs limit_wd, limit_wd, #1
- b.lo L(done_loop)
-L(loop_misaligned):
- and tmp2, src2, #0xff8
- eor tmp2, tmp2, #0xff8
- cbz tmp2, L(page_end_loop)
+ /* The following diagram explains the comparison of misaligned strings.
+ The bytes are shown in natural order. For little-endian, it is
+ reversed in the registers. The "x" bytes are before the string.
+ The "|" separates data that is loaded at one time.
+ src1 | a a a a a a a a | b b b c c c c c | . . .
+ src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
+
+ After shifting in each step, the data looks like this:
+ STEP_A STEP_B STEP_C
+ data1 a a a a a a a a b b b c c c c c b b b c c c c c
+ data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
+ The bytes with "0" are eliminated from the syndrome via mask.
+
+ Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
+ time from SRC2. The comparison happens in 3 steps. After each step
+ the loop can exit, or read from SRC1 or SRC2. */
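A little-endian-only C sketch of STEP_A's reassembly (variable names follow the asm; tmp1 and tmp2 are the two dwords of the aligned 16-byte SRC2 load, and the offset is nonzero on this path because mutually aligned strings were handled earlier):

    #include <stdint.h>

    /* Little-endian illustration: rebuild the 8 SRC2 bytes that line up with
       data1 from two aligned dwords.  offset_bits = 8 * (src2 & 7), known to
       be nonzero here, so both C shifts stay well-defined.  */
    static uint64_t step_a_data2 (uint64_t tmp1, uint64_t tmp2, unsigned offset_bits)
    {
        return (tmp1 >> offset_bits) | (tmp2 << (64 - offset_bits));
    }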
+L(src1_aligned):
+ /* Calculate offset from 8 byte alignment to string start in bits. No
+ need to mask offset since shifts are ignoring upper bits. */
+ lsl offset, src2, #3
+ bic src2, src2, #0xf
+ mov mask, -1
+ neg neg_offset, offset
ldr data1, [src1], #8
- ldr data2, [src2], #8
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
- eor diff, data1, data2 /* Non-zero if differences found. */
- bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
- ccmp diff, #0, #0, eq
- b.ne L(not_limit)
- subs limit_wd, limit_wd, #1
- b.pl L(loop_misaligned)
+ ldp tmp1, tmp2, [src2], #16
+ LS_BK mask, mask, neg_offset
+ and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
+ /* Skip the first compare if data in tmp1 is irrelevant. */
+ tbnz offset, 6, L(misaligned_mid_loop)
-L(done_loop):
- /* We found a difference or a NULL before the limit was reached. */
- and limit, limit, #7
- cbz limit, L(not_limit)
- /* Read the last word. */
- sub src1, src1, 8
- sub src2, src2, 8
- ldr data1, [src1, limit]
- ldr data2, [src2, limit]
- sub tmp1, data1, zeroones
- orr tmp2, data1, #REP8_7f
+L(loop_misaligned):
+ /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
+ LS_FW data2, tmp1, offset
+ LS_BK tmp1, tmp2, neg_offset
+ subs limit, limit, #8
+ orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
+ sub has_nul, data1, zeroones
eor diff, data1, data2 /* Non-zero if differences found. */
- bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
- ccmp diff, #0, #0, eq
- b.ne L(not_limit)
+ orr tmp3, data1, #REP8_7f
+ csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
+ orr tmp3, endloop, has_nul
+ cbnz tmp3, L(full_check)
+
+ ldr data1, [src1], #8
+L(misaligned_mid_loop):
+ /* STEP_B: Compare first part of data1 to second part of tmp2. */
+ LS_FW data2, tmp2, offset
+#ifdef __AARCH64EB__
+ /* For big-endian we do a byte reverse to avoid carry-propagation
+ problem described above. This way we can reuse the has_nul in the
+ next step and also use syndrome value trick at the end. */
+ rev tmp3, data1
+ #define data1_fixed tmp3
+#else
+ #define data1_fixed data1
+#endif
+ sub has_nul, data1_fixed, zeroones
+ orr tmp3, data1_fixed, #REP8_7f
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
+#ifdef __AARCH64EB__
+ rev has_nul, has_nul
+#endif
+ cmp limit, neg_offset, lsr #3
+ orr syndrome, diff, has_nul
+ bic syndrome, syndrome, mask /* Ignore later bytes. */
+ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
+ cbnz tmp3, L(syndrome_check)
+
+ /* STEP_C: Compare second part of data1 to first part of tmp1. */
+ ldp tmp1, tmp2, [src2], #16
+ cmp limit, #8
+ LS_BK data2, tmp1, neg_offset
+ eor diff, data2, data1 /* Non-zero if differences found. */
+ orr syndrome, diff, has_nul
+ and syndrome, syndrome, mask /* Ignore earlier bytes. */
+ csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
+ cbnz tmp3, L(syndrome_check)
+
+ ldr data1, [src1], #8
+ sub limit, limit, #8
+ b L(loop_misaligned)
+
+#ifdef __AARCH64EB__
+L(syndrome_check):
+ clz pos, syndrome
+ cmp pos, limit, lsl #3
+ b.lo L(end_quick)
+#endif
L(ret0):
mov result, #0
ret
-
-END ( __strncmp_aarch64)
+END(__strncmp_aarch64)
diff --git a/string/aarch64/strnlen-sve.S b/string/aarch64/strnlen-sve.S
index 5b9ebf7..6c43dc4 100644
--- a/string/aarch64/strnlen-sve.S
+++ b/string/aarch64/strnlen-sve.S
@@ -1,11 +1,11 @@
/*
* strnlen - calculate the length of a string with limit.
*
- * Copyright (c) 2019-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S
index 48d2495..f2090a7 100644
--- a/string/aarch64/strnlen.S
+++ b/string/aarch64/strnlen.S
@@ -1,8 +1,8 @@
/*
* strnlen - calculate the length of a string with limit.
*
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* MTE compatible.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define srcin x0
#define cntin x1
@@ -20,39 +20,30 @@
#define src x2
#define synd x3
#define shift x4
-#define wtmp w4
#define tmp x4
#define cntrem x5
#define qdata q0
#define vdata v0
#define vhas_chr v1
-#define vrepmask v2
-#define vend v3
-#define dend d3
+#define vend v2
+#define dend d2
/*
Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
- requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
- set likewise for odd bytes so that adjacent bytes can be merged. Since the
- bits in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+ Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
+ four bits per byte using the shrn instruction. Counting trailing zeros then
+ identifies the first zero byte. */
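A behavioural C reference for the return value may help alongside the vector description above; it is semantics only, not the implementation, which may inspect bytes past the first NUL or past the limit inside the 16-byte granule it has already loaded but always clamps the reported length:

    #include <stddef.h>

    /* Reference semantics: length of s, clamped to maxlen.  */
    static size_t strnlen_ref (const char *s, size_t maxlen)
    {
        size_t n = 0;
        while (n < maxlen && s[n] != '\0')
            n++;
        return n;
    }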
ENTRY (__strnlen_aarch64)
PTR_ARG (0)
SIZE_ARG (1)
bic src, srcin, 15
- mov wtmp, 0xf00f
cbz cntin, L(nomatch)
- ld1 {vdata.16b}, [src], 16
- dup vrepmask.8h, wtmp
+ ld1 {vdata.16b}, [src]
cmeq vhas_chr.16b, vdata.16b, 0
lsl shift, srcin, 2
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
cbz synd, L(start_loop)
@@ -64,37 +55,40 @@ L(finish):
csel result, cntin, result, ls
ret
+L(nomatch):
+ mov result, cntin
+ ret
+
L(start_loop):
sub tmp, src, srcin
+ add tmp, tmp, 17
subs cntrem, cntin, tmp
- b.ls L(nomatch)
+ b.lo L(nomatch)
/* Make sure that it won't overread by a 16-byte chunk */
- add tmp, cntrem, 15
- tbnz tmp, 4, L(loop32_2)
-
+ tbz cntrem, 4, L(loop32_2)
+ sub src, src, 16
.p2align 5
L(loop32):
- ldr qdata, [src], 16
+ ldr qdata, [src, 32]!
cmeq vhas_chr.16b, vdata.16b, 0
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbnz synd, L(end)
L(loop32_2):
- ldr qdata, [src], 16
+ ldr qdata, [src, 16]
subs cntrem, cntrem, 32
cmeq vhas_chr.16b, vdata.16b, 0
- b.ls L(end)
+ b.lo L(end_2)
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbz synd, L(loop32)
-
+L(end_2):
+ add src, src, 16
L(end):
- and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
- addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
- sub src, src, 16
- mov synd, vend.d[0]
+ shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
sub result, src, srcin
+ fmov synd, dend
#ifndef __AARCH64EB__
rbit synd, synd
#endif
@@ -104,9 +98,5 @@ L(end):
csel result, cntin, result, ls
ret
-L(nomatch):
- mov result, cntin
- ret
-
END (__strnlen_aarch64)
diff --git a/string/aarch64/strrchr-mte.S b/string/aarch64/strrchr-mte.S
index 1e4fb1a..bb61ab9 100644
--- a/string/aarch64/strrchr-mte.S
+++ b/string/aarch64/strrchr-mte.S
@@ -1,8 +1,8 @@
/*
* strrchr - find last position of a character in a string.
*
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* MTE compatible.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#define srcin x0
#define chrin w1
@@ -19,7 +19,6 @@
#define src x2
#define tmp x3
-#define wtmp w3
#define synd x3
#define shift x4
#define src_match x4
@@ -31,7 +30,6 @@
#define vhas_nul v2
#define vhas_chr v3
#define vrepmask v4
-#define vrepmask2 v5
#define vend v5
#define dend d5
@@ -47,55 +45,67 @@ ENTRY (__strrchr_aarch64_mte)
PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
- mov wtmp, 0x3003
- dup vrepmask.8h, wtmp
- tst srcin, 15
- beq L(loop1)
-
- ld1 {vdata.16b}, [src], 16
+ movi vrepmask.16b, 0x33
+ ld1 {vdata.16b}, [src]
cmeq vhas_nul.16b, vdata.16b, 0
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- mov wtmp, 0xf00f
- dup vrepmask2.8h, wtmp
bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ shrn vend.8b, vhas_nul.8h, 4
lsl shift, srcin, 2
fmov synd, dend
lsr synd, synd, shift
lsl synd, synd, shift
ands nul_match, synd, 0xcccccccccccccccc
bne L(tail)
- cbnz synd, L(loop2)
+ cbnz synd, L(loop2_start)
- .p2align 5
+ .p2align 4
L(loop1):
- ld1 {vdata.16b}, [src], 16
+ ldr q1, [src, 16]
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbnz synd, L(loop1_end)
+ ldr q1, [src, 32]!
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
cbz synd, L(loop1)
-
+ sub src, src, 16
+L(loop1_end):
+ add src, src, 16
cmeq vhas_nul.16b, vdata.16b, 0
+#ifdef __AARCH64EB__
+ bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ shrn vend.8b, vhas_nul.8h, 4
+ fmov synd, dend
+ rbit synd, synd
+#else
bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- bic vhas_nul.8h, 0x0f, lsl 8
- addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ shrn vend.8b, vhas_nul.8h, 4
fmov synd, dend
+#endif
ands nul_match, synd, 0xcccccccccccccccc
- beq L(loop2)
-
+ beq L(loop2_start)
L(tail):
sub nul_match, nul_match, 1
and chr_match, synd, 0x3333333333333333
ands chr_match, chr_match, nul_match
- sub result, src, 1
+ add result, src, 15
clz tmp, chr_match
sub result, result, tmp, lsr 2
csel result, result, xzr, ne
ret
.p2align 4
+ nop
+ nop
+L(loop2_start):
+ add src, src, 16
+ bic vrepmask.8h, 0xf0
+
L(loop2):
cmp synd, 0
csel src_match, src, src_match, ne
diff --git a/string/aarch64/strrchr-sve.S b/string/aarch64/strrchr-sve.S
index d36d69a..825a738 100644
--- a/string/aarch64/strrchr-sve.S
+++ b/string/aarch64/strrchr-sve.S
@@ -1,11 +1,11 @@
/*
* strrchr - find the last of a character in a string
*
- * Copyright (c) 2019-2021, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
#if __ARM_FEATURE_SVE
/* Assumptions:
diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S
index 56185ff..bf9cb29 100644
--- a/string/aarch64/strrchr.S
+++ b/string/aarch64/strrchr.S
@@ -1,8 +1,8 @@
/*
* strrchr - find last position of a character in a string.
*
- * Copyright (c) 2014-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2014-2022, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* Assumptions:
@@ -11,7 +11,7 @@
* Neon Available.
*/
-#include "../asmdefs.h"
+#include "asmdefs.h"
/* Arguments and results. */
#define srcin x0