Diffstat (limited to 'string/aarch64/strlen.S')
-rw-r--r--  string/aarch64/strlen.S  274
1 file changed, 134 insertions, 140 deletions
diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S
index 2293f73..a1b164a 100644
--- a/string/aarch64/strlen.S
+++ b/string/aarch64/strlen.S
@@ -1,84 +1,88 @@
/*
- * strlen - calculate the length of a string
+ * strlen - calculate the length of a string.
*
- * Copyright (c) 2013, Arm Limited.
+ * Copyright (c) 2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ * Not MTE compatible.
*/
#include "../asmdefs.h"
-/* To test the page crossing code path more thoroughly, compile with
- -DTEST_PAGE_CROSS - this will force all calls through the slower
- entry path. This option is not intended for production use. */
-
-/* Arguments and results. */
-#define srcin x0
-#define len x0
-
-/* Locals and temporaries. */
-#define src x1
-#define data1 x2
-#define data2 x3
-#define has_nul1 x4
-#define has_nul2 x5
-#define tmp1 x4
-#define tmp2 x5
-#define tmp3 x6
-#define tmp4 x7
-#define zeroones x8
-
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. A faster check
- (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
- false hits for characters 129..255. */
+#define srcin x0
+#define len x0
+
+#define src x1
+#define data1 x2
+#define data2 x3
+#define has_nul1 x4
+#define has_nul2 x5
+#define tmp1 x4
+#define tmp2 x5
+#define tmp3 x6
+#define tmp4 x7
+#define zeroones x8
+
+#define maskv v0
+#define maskd d0
+#define dataq1 q1
+#define dataq2 q2
+#define datav1 v1
+#define datav2 v2
+#define tmp x2
+#define tmpw w2
+#define synd x3
+#define shift x4
+
+/* For the first 32 bytes, NUL detection works on the principle that
+ (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
+ byte is zero, and can be done in parallel across the entire word. */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
+
+/* To test the page crossing code path more thoroughly, compile with
+ -DTEST_PAGE_CROSS - this will force all calls through the slower
+ entry path. This option is not intended for production use. */
#ifdef TEST_PAGE_CROSS
-# define MIN_PAGE_SIZE 15
+# define MIN_PAGE_SIZE 32
#else
# define MIN_PAGE_SIZE 4096
#endif
- /* Since strings are short on average, we check the first 16 bytes
- of the string for a NUL character. In order to do an unaligned ldp
- safely we have to do a page cross check first. If there is a NUL
- byte we calculate the length from the 2 8-byte words using
- conditional select to reduce branch mispredictions (it is unlikely
- __strlen_aarch64 will be repeatedly called on strings with the same length).
-
- If the string is longer than 16 bytes, we align src so don't need
- further page cross checks, and process 32 bytes per iteration
- using the fast NUL check. If we encounter non-ASCII characters,
- fallback to a second loop using the full NUL check.
-
- If the page cross check fails, we read 16 bytes from an aligned
- address, remove any characters before the string, and continue
- in the main loop using aligned loads. Since strings crossing a
- page in the first 16 bytes are rare (probability of
- 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
-
- AArch64 systems have a minimum page size of 4k. We don't bother
- checking for larger page sizes - the cost of setting up the correct
- page size is just not worth the extra gain from a small reduction in
- the cases taking the slow path. Note that we only care about
- whether the first fetch, which may be misaligned, crosses a page
- boundary. */
+/* Core algorithm:
+
+ Since strings are short on average, we check the first 32 bytes of the
+ string for a NUL character without aligning the string. In order to use
+ unaligned loads safely we must do a page cross check first.
+
+ If there is a NUL byte we calculate the length from the 2 8-byte words
+ using conditional select to reduce branch mispredictions (it is unlikely
+ strlen will be repeatedly called on strings with the same length).
+
+ If the string is longer than 32 bytes, align src so we don't need further
+ page cross checks, and process 32 bytes per iteration using a fast SIMD
+ loop.
+
+ If the page cross check fails, we read 32 bytes from an aligned address,
+ and ignore any characters before the string. If it contains a NUL
+ character, return the length; if not, continue in the main loop. */
ENTRY (__strlen_aarch64)
+ PTR_ARG (0)
and tmp1, srcin, MIN_PAGE_SIZE - 1
- mov zeroones, REP8_01
- cmp tmp1, MIN_PAGE_SIZE - 16
- b.gt L(page_cross)
+ cmp tmp1, MIN_PAGE_SIZE - 32
+ b.hi L(page_cross)
+
+ /* Look for a NUL byte in the first 16 bytes. */
ldp data1, data2, [srcin]
+ mov zeroones, REP8_01
+
#ifdef __AARCH64EB__
/* For big-endian, carry propagation (if the final byte in the
string is 0x01) means we cannot use has_nul1/2 directly.
@@ -94,113 +98,103 @@ ENTRY (__strlen_aarch64)
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
- beq L(main_loop_entry)
+ b.eq L(bytes16_31)
- /* Enter with C = has_nul1 == 0. */
+ /* Find the exact offset of the first NUL byte in the first 16 bytes
+ from the string start. Enter with C = has_nul1 == 0. */
csel has_nul1, has_nul1, has_nul2, cc
mov len, 8
rev has_nul1, has_nul1
- clz tmp1, has_nul1
csel len, xzr, len, cc
+ clz tmp1, has_nul1
add len, len, tmp1, lsr 3
ret
- /* The inner loop processes 32 bytes per iteration and uses the fast
- NUL check. If we encounter non-ASCII characters, use a second
- loop with the accurate NUL check. */
- .p2align 4
-L(main_loop_entry):
- bic src, srcin, 15
- sub src, src, 16
-L(main_loop):
- ldp data1, data2, [src, 32]!
-L(page_cross_entry):
- sub tmp1, data1, zeroones
- sub tmp3, data2, zeroones
- orr tmp2, tmp1, tmp3
- tst tmp2, zeroones, lsl 7
- bne 1f
- ldp data1, data2, [src, 16]
+ .p2align 3
+ /* Look for a NUL byte at offset 16..31 in the string. */
+L(bytes16_31):
+ ldp data1, data2, [srcin, 16]
+#ifdef __AARCH64EB__
+ rev data1, data1
+ rev data2, data2
+#endif
sub tmp1, data1, zeroones
- sub tmp3, data2, zeroones
- orr tmp2, tmp1, tmp3
- tst tmp2, zeroones, lsl 7
- beq L(main_loop)
- add src, src, 16
-1:
- /* The fast check failed, so do the slower, accurate NUL check. */
orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
- beq L(nonascii_loop)
+ b.eq L(loop_entry)
- /* Enter with C = has_nul1 == 0. */
-L(tail):
-#ifdef __AARCH64EB__
- /* For big-endian, carry propagation (if the final byte in the
- string is 0x01) means we cannot use has_nul1/2 directly. The
- easiest way to get the correct byte is to byte-swap the data
- and calculate the syndrome a second time. */
- csel data1, data1, data2, cc
- rev data1, data1
- sub tmp1, data1, zeroones
- orr tmp2, data1, REP8_7f
- bic has_nul1, tmp1, tmp2
-#else
+ /* Find the exact offset of the first NUL byte at offset 16..31 from
+ the string start. Enter with C = has_nul1 == 0. */
csel has_nul1, has_nul1, has_nul2, cc
-#endif
- sub len, src, srcin
+ mov len, 24
rev has_nul1, has_nul1
- add tmp2, len, 8
+ mov tmp3, 16
clz tmp1, has_nul1
- csel len, len, tmp2, cc
+ csel len, tmp3, len, cc
add len, len, tmp1, lsr 3
ret
-L(nonascii_loop):
- ldp data1, data2, [src, 16]!
- sub tmp1, data1, zeroones
- orr tmp2, data1, REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, REP8_7f
- bics has_nul1, tmp1, tmp2
- bic has_nul2, tmp3, tmp4
- ccmp has_nul2, 0, 0, eq
- bne L(tail)
- ldp data1, data2, [src, 16]!
- sub tmp1, data1, zeroones
- orr tmp2, data1, REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, REP8_7f
- bics has_nul1, tmp1, tmp2
- bic has_nul2, tmp3, tmp4
- ccmp has_nul2, 0, 0, eq
- beq L(nonascii_loop)
- b L(tail)
+L(loop_entry):
+ bic src, srcin, 31
- /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
- srcin to 0x7f, so we ignore any NUL bytes before the string.
- Then continue in the aligned loop. */
-L(page_cross):
- bic src, srcin, 15
- ldp data1, data2, [src]
- lsl tmp1, srcin, 3
- mov tmp4, -1
+ .p2align 5
+L(loop):
+ ldp dataq1, dataq2, [src, 32]!
+ uminp maskv.16b, datav1.16b, datav2.16b
+ uminp maskv.16b, maskv.16b, maskv.16b
+ cmeq maskv.8b, maskv.8b, 0
+ fmov synd, maskd
+ cbz synd, L(loop)
+
+ /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
+ cmeq maskv.16b, datav1.16b, 0
+ sub len, src, srcin
+ tst synd, 0xffffffff
+ b.ne 1f
+ cmeq maskv.16b, datav2.16b, 0
+ add len, len, 16
+1:
+ /* Generate a bitmask and compute correct byte offset. */
#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
+ bic maskv.8h, 0xf0
#else
- /* Little-endian. Early bytes are at LSB. */
- lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
+ bic maskv.8h, 0x0f, lsl 8
+#endif
+ umaxp maskv.16b, maskv.16b, maskv.16b
+ fmov synd, maskd
+#ifndef __AARCH64EB__
+ rbit synd, synd
#endif
- orr tmp1, tmp1, REP8_80
- orn data1, data1, tmp1
- orn tmp2, data2, tmp1
- tst srcin, 8
- csel data1, data1, tmp4, eq
- csel data2, data2, tmp2, eq
- b L(page_cross_entry)
+ clz tmp, synd
+ add len, len, tmp, lsr 2
+ ret
+
+ .p2align 4
+
+L(page_cross):
+ bic src, srcin, 31
+ mov tmpw, 0x0c03
+ movk tmpw, 0xc030, lsl 16
+ ld1 {datav1.16b, datav2.16b}, [src]
+ dup maskv.4s, tmpw
+ cmeq datav1.16b, datav1.16b, 0
+ cmeq datav2.16b, datav2.16b, 0
+ and datav1.16b, datav1.16b, maskv.16b
+ and datav2.16b, datav2.16b, maskv.16b
+ addp maskv.16b, datav1.16b, datav2.16b
+ addp maskv.16b, maskv.16b, maskv.16b
+ fmov synd, maskd
+ lsl shift, srcin, 1
+ lsr synd, synd, shift
+ cbz synd, L(loop)
+
+ rbit synd, synd
+ clz len, synd
+ lsr len, len, 1
+ ret
END (__strlen_aarch64)
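
For readers following the patch, here is a minimal C sketch (not part of the patch; the helper names nul_syndrome and first_nul_index are invented for illustration) of the word-at-a-time NUL check that the first-32-bytes path relies on, together with the rev/clz step that turns the syndrome into a byte index on little-endian:

    #include <stdint.h>

    /* Non-zero iff one of the eight bytes of x is zero; mirrors
       (X - 1) & ~(X | 0x7f), computed with REP8_01 and REP8_7f.
       Bytes after the first zero byte may also be flagged because the
       subtraction borrows across bytes, which is why the assembly
       byte-swaps the data first on big-endian.  */
    static inline uint64_t nul_syndrome(uint64_t x)
    {
        const uint64_t rep8_01 = 0x0101010101010101ULL;
        const uint64_t rep8_7f = 0x7f7f7f7f7f7f7f7fULL;
        return (x - rep8_01) & ~(x | rep8_7f);
    }

    /* Index of the first NUL byte, assuming a little-endian load and a
       non-zero syndrome; corresponds to "rev; clz; ...; lsr 3".  */
    static inline unsigned first_nul_index(uint64_t syndrome)
    {
        return (unsigned)__builtin_clzll(__builtin_bswap64(syndrome)) >> 3;
    }

In the patch, csel then picks between the syndromes of the two 8-byte words and between a base offset of 0 or 8 (or 16 or 24 in L(bytes16_31)) without branching, before the rev/clz arithmetic above produces the final length.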
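
Similarly, a rough C rendering of the new 32-byte SIMD probe in L(loop), using Advanced SIMD intrinsics from <arm_neon.h> (illustrative only; chunk32_nul_syndrome is a made-up name, and the probe is only reached after the entry check (srcin & (MIN_PAGE_SIZE - 1)) <= MIN_PAGE_SIZE - 32 or the alignment in L(loop_entry) has established that the 32-byte read cannot cross a page):

    #include <arm_neon.h>
    #include <stdint.h>

    /* Returns a 64-bit syndrome that is non-zero iff the 32 bytes at p
       contain a NUL.  The two pairwise-minimum steps fold 32 bytes into 8,
       so each result byte covers 4 consecutive input bytes; the low 32 bits
       of the syndrome therefore cover the first 16 input bytes, matching
       the "tst synd, 0xffffffff" test in the loop.  */
    static inline uint64_t chunk32_nul_syndrome(const uint8_t *p)
    {
        uint8x16_t d1 = vld1q_u8(p);
        uint8x16_t d2 = vld1q_u8(p + 16);
        uint8x16_t m  = vpminq_u8(d1, d2);        /* uminp: 32 -> 16 bytes   */
        m = vpminq_u8(m, m);                      /* uminp: low 8 bytes left */
        uint8x8_t  z  = vceqz_u8(vget_low_u8(m)); /* cmeq against zero       */
        return vget_lane_u64(vreinterpret_u64_u8(z), 0);  /* fmov synd, d0  */
    }

The assembly then re-runs cmeq on whichever 16-byte half contained the NUL and uses the bic/umaxp/rbit/clz sequence to compute the exact byte offset; the sketch above only covers the presence test performed on every iteration.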