author     Treehugger Robot <treehugger-gerrit@google.com>    2020-10-29 18:27:22 +0000
committer  Gerrit Code Review <noreply-gerritcodereview@google.com>    2020-10-29 18:27:22 +0000
commit     614c18275dcd0ad4a6a1c17122491d5d4ec2cdf8 (patch)
tree       5a0a970a664c3e5c888d0458b1d228b5ded9af8e
parent     0079b9694aaa9d2d9f26ca74f2439d661782b730 (diff)
parent     e82c0ef966757b0c5885cbcc863cbce48493c598 (diff)
download   arm-optimized-routines-614c18275dcd0ad4a6a1c17122491d5d4ec2cdf8.tar.gz
Merge "Upgrade arm-optimized-routines to 0f4ae0c5b561de25acb10130fd5e473ec038f89d"
-rw-r--r--  METADATA                         |   6
-rw-r--r--  README                           |   2
-rw-r--r--  string/Dir.mk                    |   5
-rw-r--r--  string/aarch64/memcpy-advsimd.S  |   7
-rw-r--r--  string/aarch64/strcmp-mte.S      |  16
-rw-r--r--  string/aarch64/strlen.S          | 274
-rw-r--r--  string/aarch64/strncmp-mte.S     |   6
-rw-r--r--  string/arm/memcpy.S              |  22
-rw-r--r--  string/bench/memcpy.c            |  34
-rw-r--r--  string/bench/strlen.c            | 221
10 files changed, 423 insertions, 170 deletions
diff --git a/METADATA b/METADATA
index 9f448ed..a8d7b6f 100644
--- a/METADATA
+++ b/METADATA
@@ -9,11 +9,11 @@ third_party {
type: GIT
value: "https://github.com/ARM-software/optimized-routines.git"
}
- version: "ef907c7a799ae7fd8662fb56a6c5e9437508f645"
+ version: "0f4ae0c5b561de25acb10130fd5e473ec038f89d"
license_type: NOTICE
last_upgrade_date {
year: 2020
- month: 6
- day: 1
+ month: 10
+ day: 28
}
}
diff --git a/README b/README
index 6cde5cd..041aa1a 100644
--- a/README
+++ b/README
@@ -9,7 +9,7 @@ contributor-agreement.pdf. This is needed so upstreaming code
to projects that require copyright assignment is possible.
Regular quarterly releases are tagged as vYY.MM, the latest
-release is v20.05.
+release is v20.08.
Source code layout:
diff --git a/string/Dir.mk b/string/Dir.mk
index ae7c673..7817357 100644
--- a/string/Dir.mk
+++ b/string/Dir.mk
@@ -39,7 +39,9 @@ string-tests := \
build/bin/test/strnlen \
build/bin/test/strncmp
-string-benches := build/bin/bench/memcpy
+string-benches := \
+ build/bin/bench/memcpy \
+ build/bin/bench/strlen
string-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-lib-srcs)))
string-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-test-srcs)))
@@ -95,6 +97,7 @@ check-string: $(string-tests-out)
! grep FAIL $^
bench-string: $(string-benches)
+ $(EMULATOR) build/bin/bench/strlen
$(EMULATOR) build/bin/bench/memcpy
install-string: \
diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S
index 3004179..23545a3 100644
--- a/string/aarch64/memcpy-advsimd.S
+++ b/string/aarch64/memcpy-advsimd.S
@@ -179,12 +179,13 @@ L(copy_long_backwards):
b.ls L(copy64_from_start)
L(loop64_backwards):
- stp A_q, B_q, [dstend, -32]
+ str B_q, [dstend, -16]
+ str A_q, [dstend, -32]
ldp A_q, B_q, [srcend, -96]
- stp C_q, D_q, [dstend, -64]
+ str D_q, [dstend, -48]
+ str C_q, [dstend, -64]!
ldp C_q, D_q, [srcend, -128]
sub srcend, srcend, 64
- sub dstend, dstend, 64
subs count, count, 64
b.hi L(loop64_backwards)
diff --git a/string/aarch64/strcmp-mte.S b/string/aarch64/strcmp-mte.S
index 8f2abc4..1b6db42 100644
--- a/string/aarch64/strcmp-mte.S
+++ b/string/aarch64/strcmp-mte.S
@@ -99,6 +99,8 @@ L(end):
sub result, data1, data2, lsr 56
ret
+ .p2align 4
+
L(mutual_align):
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
@@ -127,17 +129,18 @@ L(do_misaligned):
b.ne L(do_misaligned)
L(src1_aligned):
- lsl shift, src2, 3
+ neg shift, src2, lsl 3
bic src2, src2, 7
ldr data3, [src2], 8
#ifdef __AARCH64EB__
rev data3, data3
#endif
+ lsr tmp, zeroones, shift
+ orr data3, data3, tmp
sub has_nul, data3, zeroones
orr tmp, data3, REP8_7f
- bic has_nul, has_nul, tmp
- lsr tmp, has_nul, shift
- cbnz tmp, L(tail)
+ bics has_nul, has_nul, tmp
+ b.ne L(tail)
sub off1, src2, src1
@@ -156,8 +159,7 @@ L(loop_unaligned):
ccmp data1, data2, 0, eq
b.eq L(loop_unaligned)
- neg tmp, shift
- lsl tmp, has_nul, tmp
+ lsl tmp, has_nul, shift
#ifdef __AARCH64EB__
rev tmp, tmp
#endif
@@ -166,6 +168,7 @@ L(loop_unaligned):
cbnz syndrome, L(end)
L(tail):
ldr data1, [src1]
+ neg shift, shift
lsr data2, data3, shift
lsr has_nul, has_nul, shift
#ifdef __AARCH64EB__
@@ -180,6 +183,5 @@ L(done):
sub result, data1, data2
ret
-
END (__strcmp_aarch64_mte)
diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S
index 3aa444b..b20eaeb 100644
--- a/string/aarch64/strlen.S
+++ b/string/aarch64/strlen.S
@@ -1,84 +1,87 @@
/*
- * strlen - calculate the length of a string
+ * strlen - calculate the length of a string.
*
- * Copyright (c) 2013, Arm Limited.
+ * Copyright (c) 2020, Arm Limited.
* SPDX-License-Identifier: MIT
*/
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ * Not MTE compatible.
*/
#include "../asmdefs.h"
-/* To test the page crossing code path more thoroughly, compile with
- -DTEST_PAGE_CROSS - this will force all calls through the slower
- entry path. This option is not intended for production use. */
-
-/* Arguments and results. */
-#define srcin x0
-#define len x0
-
-/* Locals and temporaries. */
-#define src x1
-#define data1 x2
-#define data2 x3
-#define has_nul1 x4
-#define has_nul2 x5
-#define tmp1 x4
-#define tmp2 x5
-#define tmp3 x6
-#define tmp4 x7
-#define zeroones x8
-
- /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
- (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
- can be done in parallel across the entire word. A faster check
- (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
- false hits for characters 129..255. */
+#define srcin x0
+#define len x0
+
+#define src x1
+#define data1 x2
+#define data2 x3
+#define has_nul1 x4
+#define has_nul2 x5
+#define tmp1 x4
+#define tmp2 x5
+#define tmp3 x6
+#define tmp4 x7
+#define zeroones x8
+
+#define maskv v0
+#define maskd d0
+#define dataq1 q1
+#define dataq2 q2
+#define datav1 v1
+#define datav2 v2
+#define tmp x2
+#define tmpw w2
+#define synd x3
+#define shift x4
+
+/* For the first 32 bytes, NUL detection works on the principle that
+ (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
+ byte is zero, and can be done in parallel across the entire word. */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
+
+/* To test the page crossing code path more thoroughly, compile with
+ -DTEST_PAGE_CROSS - this will force all calls through the slower
+ entry path. This option is not intended for production use. */
#ifdef TEST_PAGE_CROSS
-# define MIN_PAGE_SIZE 15
+# define MIN_PAGE_SIZE 32
#else
# define MIN_PAGE_SIZE 4096
#endif
- /* Since strings are short on average, we check the first 16 bytes
- of the string for a NUL character. In order to do an unaligned ldp
- safely we have to do a page cross check first. If there is a NUL
- byte we calculate the length from the 2 8-byte words using
- conditional select to reduce branch mispredictions (it is unlikely
- __strlen_aarch64 will be repeatedly called on strings with the same length).
-
- If the string is longer than 16 bytes, we align src so don't need
- further page cross checks, and process 32 bytes per iteration
- using the fast NUL check. If we encounter non-ASCII characters,
- fallback to a second loop using the full NUL check.
-
- If the page cross check fails, we read 16 bytes from an aligned
- address, remove any characters before the string, and continue
- in the main loop using aligned loads. Since strings crossing a
- page in the first 16 bytes are rare (probability of
- 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
-
- AArch64 systems have a minimum page size of 4k. We don't bother
- checking for larger page sizes - the cost of setting up the correct
- page size is just not worth the extra gain from a small reduction in
- the cases taking the slow path. Note that we only care about
- whether the first fetch, which may be misaligned, crosses a page
- boundary. */
+/* Core algorithm:
+
+ Since strings are short on average, we check the first 32 bytes of the
+ string for a NUL character without aligning the string. In order to use
+ unaligned loads safely we must do a page cross check first.
+
+ If there is a NUL byte we calculate the length from the 2 8-byte words
+ using conditional select to reduce branch mispredictions (it is unlikely
+ strlen will be repeatedly called on strings with the same length).
+
+ If the string is longer than 32 bytes, align src so we don't need further
+ page cross checks, and process 32 bytes per iteration using a fast SIMD
+ loop.
+
+ If the page cross check fails, we read 32 bytes from an aligned address,
+ and ignore any characters before the string. If it contains a NUL
+ character, return the length, if not, continue in the main loop. */
ENTRY (__strlen_aarch64)
and tmp1, srcin, MIN_PAGE_SIZE - 1
- mov zeroones, REP8_01
- cmp tmp1, MIN_PAGE_SIZE - 16
- b.gt L(page_cross)
+ cmp tmp1, MIN_PAGE_SIZE - 32
+ b.hi L(page_cross)
+
+ /* Look for a NUL byte in the first 16 bytes. */
ldp data1, data2, [srcin]
+ mov zeroones, REP8_01
+
#ifdef __AARCH64EB__
/* For big-endian, carry propagation (if the final byte in the
string is 0x01) means we cannot use has_nul1/2 directly.
@@ -94,114 +97,103 @@ ENTRY (__strlen_aarch64)
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
- beq L(main_loop_entry)
+ b.eq L(bytes16_31)
- /* Enter with C = has_nul1 == 0. */
+ /* Find the exact offset of the first NUL byte in the first 16 bytes
+ from the string start. Enter with C = has_nul1 == 0. */
csel has_nul1, has_nul1, has_nul2, cc
mov len, 8
rev has_nul1, has_nul1
- clz tmp1, has_nul1
csel len, xzr, len, cc
+ clz tmp1, has_nul1
add len, len, tmp1, lsr 3
ret
- /* The inner loop processes 32 bytes per iteration and uses the fast
- NUL check. If we encounter non-ASCII characters, use a second
- loop with the accurate NUL check. */
- .p2align 4
-L(main_loop_entry):
- bic src, srcin, 15
- sub src, src, 16
-L(main_loop):
- ldp data1, data2, [src, 32]!
-L(page_cross_entry):
- sub tmp1, data1, zeroones
- sub tmp3, data2, zeroones
- orr tmp2, tmp1, tmp3
- tst tmp2, zeroones, lsl 7
- bne 1f
- ldp data1, data2, [src, 16]
+ .p2align 3
+ /* Look for a NUL byte at offset 16..31 in the string. */
+L(bytes16_31):
+ ldp data1, data2, [srcin, 16]
+#ifdef __AARCH64EB__
+ rev data1, data1
+ rev data2, data2
+#endif
sub tmp1, data1, zeroones
- sub tmp3, data2, zeroones
- orr tmp2, tmp1, tmp3
- tst tmp2, zeroones, lsl 7
- beq L(main_loop)
- add src, src, 16
-1:
- /* The fast check failed, so do the slower, accurate NUL check. */
orr tmp2, data1, REP8_7f
+ sub tmp3, data2, zeroones
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
- beq L(nonascii_loop)
+ b.eq L(loop_entry)
- /* Enter with C = has_nul1 == 0. */
-L(tail):
-#ifdef __AARCH64EB__
- /* For big-endian, carry propagation (if the final byte in the
- string is 0x01) means we cannot use has_nul1/2 directly. The
- easiest way to get the correct byte is to byte-swap the data
- and calculate the syndrome a second time. */
- csel data1, data1, data2, cc
- rev data1, data1
- sub tmp1, data1, zeroones
- orr tmp2, data1, REP8_7f
- bic has_nul1, tmp1, tmp2
-#else
+ /* Find the exact offset of the first NUL byte at offset 16..31 from
+ the string start. Enter with C = has_nul1 == 0. */
csel has_nul1, has_nul1, has_nul2, cc
-#endif
- sub len, src, srcin
+ mov len, 24
rev has_nul1, has_nul1
- add tmp2, len, 8
+ mov tmp3, 16
clz tmp1, has_nul1
- csel len, len, tmp2, cc
+ csel len, tmp3, len, cc
add len, len, tmp1, lsr 3
ret
-L(nonascii_loop):
- ldp data1, data2, [src, 16]!
- sub tmp1, data1, zeroones
- orr tmp2, data1, REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, REP8_7f
- bics has_nul1, tmp1, tmp2
- bic has_nul2, tmp3, tmp4
- ccmp has_nul2, 0, 0, eq
- bne L(tail)
- ldp data1, data2, [src, 16]!
- sub tmp1, data1, zeroones
- orr tmp2, data1, REP8_7f
- sub tmp3, data2, zeroones
- orr tmp4, data2, REP8_7f
- bics has_nul1, tmp1, tmp2
- bic has_nul2, tmp3, tmp4
- ccmp has_nul2, 0, 0, eq
- beq L(nonascii_loop)
- b L(tail)
+L(loop_entry):
+ bic src, srcin, 31
- /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
- srcin to 0x7f, so we ignore any NUL bytes before the string.
- Then continue in the aligned loop. */
-L(page_cross):
- bic src, srcin, 15
- ldp data1, data2, [src]
- lsl tmp1, srcin, 3
- mov tmp4, -1
+ .p2align 5
+L(loop):
+ ldp dataq1, dataq2, [src, 32]!
+ uminp maskv.16b, datav1.16b, datav2.16b
+ uminp maskv.16b, maskv.16b, maskv.16b
+ cmeq maskv.8b, maskv.8b, 0
+ fmov synd, maskd
+ cbz synd, L(loop)
+
+ /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
+ cmeq maskv.16b, datav1.16b, 0
+ sub len, src, srcin
+ tst synd, 0xffffffff
+ b.ne 1f
+ cmeq maskv.16b, datav2.16b, 0
+ add len, len, 16
+1:
+ /* Generate a bitmask and compute correct byte offset. */
#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
+ bic maskv.8h, 0xf0
#else
- /* Little-endian. Early bytes are at LSB. */
- lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
+ bic maskv.8h, 0x0f, lsl 8
#endif
- orr tmp1, tmp1, REP8_80
- orn data1, data1, tmp1
- orn tmp2, data2, tmp1
- tst srcin, 8
- csel data1, data1, tmp4, eq
- csel data2, data2, tmp2, eq
- b L(page_cross_entry)
+ umaxp maskv.16b, maskv.16b, maskv.16b
+ fmov synd, maskd
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ clz tmp, synd
+ add len, len, tmp, lsr 2
+ ret
-END (__strlen_aarch64)
+ .p2align 4
+L(page_cross):
+ bic src, srcin, 31
+ mov tmpw, 0x0c03
+ movk tmpw, 0xc030, lsl 16
+ ld1 {datav1.16b, datav2.16b}, [src]
+ dup maskv.4s, tmpw
+ cmeq datav1.16b, datav1.16b, 0
+ cmeq datav2.16b, datav2.16b, 0
+ and datav1.16b, datav1.16b, maskv.16b
+ and datav2.16b, datav2.16b, maskv.16b
+ addp maskv.16b, datav1.16b, datav2.16b
+ addp maskv.16b, maskv.16b, maskv.16b
+ fmov synd, maskd
+ lsl shift, srcin, 1
+ lsr synd, synd, shift
+ cbz synd, L(loop)
+
+ rbit synd, synd
+ clz len, synd
+ lsr len, len, 1
+ ret
+
+END (__strlen_aarch64)
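
Note on the strlen.S change above: the word-at-a-time NUL test kept in the new header
comment ((X - 1) & ~X & 0x80, applied per byte) is easy to sanity-check in plain C.
The helper below is an illustrative sketch, not part of the patch; it assumes 64-bit
little-endian words loaded from the string, matching the non-__AARCH64EB__ path.

    #include <stdint.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

    /* Non-zero iff some byte of x is zero:
       (x - 0x01..01) & ~(x | 0x7f..7f) sets the 0x80 bit of every byte of x
       that was zero (the sub/orr/bics pattern used in the assembly above).  */
    static inline uint64_t
    has_nul_byte (uint64_t x)
    {
      return (x - REP8_01) & ~(x | REP8_7f);
    }

For example, the little-endian word 0x0000006f6c6c6568 ("hello" plus three NUL bytes)
gives 0x8080800000000000; byte-reversing and counting leading zero bits (the rev + clz
used in the assembly), or counting trailing zero bits in C, then shifting right by 3
recovers the NUL offset 5.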
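The "Core algorithm" comment also reads naturally as C. The function below is a
simplified, hypothetical word-at-a-time model of that structure (check the first 32
bytes unaligned, then scan 32 bytes per iteration from an aligned address). It omits
the page-cross guard and the Advanced SIMD syndrome generation of the real routine,
and like the assembly it may read past the terminating NUL, so it is a reader's sketch
under those assumptions, not a drop-in replacement.

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

    /* Same NUL-byte syndrome as in the sketch above.  */
    static inline uint64_t
    has_nul_byte (uint64_t x)
    {
      return (x - REP8_01) & ~(x | REP8_7f);
    }

    /* Hypothetical reference model of the structure described in the
       "Core algorithm" comment.  Little-endian only; no page-cross guard,
       so loads may run past the NUL (the real code checks for this).  */
    size_t
    strlen_model (const char *s)
    {
      uint64_t word, nul;

      /* Check the first 32 bytes with unaligned 8-byte loads.  */
      for (size_t off = 0; off < 32; off += 8)
        {
          memcpy (&word, s + off, 8);
          nul = has_nul_byte (word);
          if (nul)
            return off + (__builtin_ctzll (nul) >> 3);
        }

      /* Longer string: align past the checked region and process 32 bytes
         per iteration (the real loop builds the syndrome with Advanced SIMD
         uminp/cmeq instead of scalar words).  */
      const char *p = (const char *) ((uintptr_t) s & ~(uintptr_t) 31) + 32;
      for (;; p += 32)
        for (size_t off = 0; off < 32; off += 8)
          {
            memcpy (&word, p + off, 8);
            nul = has_nul_byte (word);
            if (nul)
              return (size_t) (p - s) + off + (__builtin_ctzll (nul) >> 3);
          }
    }
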
diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S
index b7e3914..46765d6 100644
--- a/string/aarch64/strncmp-mte.S
+++ b/string/aarch64/strncmp-mte.S
@@ -167,10 +167,10 @@ L(mutual_align):
neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
ldr data2, [src2], #8
mov tmp2, #~0
- and count, count, #0x3f
LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
- /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
- add limit, limit, count
+ /* Adjust the limit and ensure it doesn't overflow. */
+ adds limit, limit, count
+ csinv limit, limit, xzr, lo
orr data1, data1, tmp2
orr data2, data2, tmp2
b L(start_realigned)
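
Note on the strncmp-mte.S change above: the adds/csinv pair makes the limit adjustment
saturate instead of wrapping when limit is near its maximum. Roughly, in C (an
illustrative model, assuming limit and count behave as unsigned 64-bit values, as in
the registers used here):

    #include <stdint.h>

    /* Model of "adds limit, limit, count; csinv limit, limit, xzr, lo":
       on unsigned overflow (carry set) the limit is clamped to all-ones
       rather than wrapping around to a small value.  */
    static inline uint64_t
    adjust_limit (uint64_t limit, uint64_t count)
    {
      uint64_t sum = limit + count;
      return sum < limit ? UINT64_MAX : sum;
    }
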
diff --git a/string/arm/memcpy.S b/string/arm/memcpy.S
index aab78a2..4a34142 100644
--- a/string/arm/memcpy.S
+++ b/string/arm/memcpy.S
@@ -124,7 +124,7 @@ ENTRY (__memcpy_arm)
mov dst, dstin /* Preserve dstin, we need to return it. */
cmp count, #64
- bge L(cpy_not_short)
+ bhs L(cpy_not_short)
/* Deal with small copies quickly by dropping straight into the
exit block. */
@@ -239,10 +239,10 @@ L(cpy_not_short):
1:
subs tmp2, count, #64 /* Use tmp2 for count. */
- blt L(tail63aligned)
+ blo L(tail63aligned)
cmp tmp2, #512
- bge L(cpy_body_long)
+ bhs L(cpy_body_long)
L(cpy_body_medium): /* Count in tmp2. */
#ifdef USE_VFP
@@ -266,7 +266,7 @@ L(cpy_body_medium): /* Count in tmp2. */
add src, src, #64
vstr d1, [dst, #56]
add dst, dst, #64
- bge 1b
+ bhs 1b
tst tmp2, #0x3f
beq L(done)
@@ -312,7 +312,7 @@ L(tail63aligned): /* Count in tmp2. */
ldrd A_l, A_h, [src, #64]!
strd A_l, A_h, [dst, #64]!
subs tmp2, tmp2, #64
- bge 1b
+ bhs 1b
tst tmp2, #0x3f
bne 1f
ldr tmp2,[sp], #FRAME_SIZE
@@ -383,7 +383,7 @@ L(cpy_body_long): /* Count in tmp2. */
add src, src, #32
subs tmp2, tmp2, #prefetch_lines * 64 * 2
- blt 2f
+ blo 2f
1:
cpy_line_vfp d3, 0
cpy_line_vfp d4, 64
@@ -395,7 +395,7 @@ L(cpy_body_long): /* Count in tmp2. */
add dst, dst, #2 * 64
add src, src, #2 * 64
subs tmp2, tmp2, #prefetch_lines * 64
- bge 1b
+ bhs 1b
2:
cpy_tail_vfp d3, 0
@@ -499,15 +499,15 @@ L(cpy_notaligned):
1:
pld [src, #(3 * 64)]
subs count, count, #64
- ldrmi tmp2, [sp], #FRAME_SIZE
- bmi L(tail63unaligned)
+ ldrlo tmp2, [sp], #FRAME_SIZE
+ blo L(tail63unaligned)
pld [src, #(4 * 64)]
#ifdef USE_NEON
vld1.8 {d0-d3}, [src]!
vld1.8 {d4-d7}, [src]!
subs count, count, #64
- bmi 2f
+ blo 2f
1:
pld [src, #(4 * 64)]
vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
@@ -515,7 +515,7 @@ L(cpy_notaligned):
vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
vld1.8 {d4-d7}, [src]!
subs count, count, #64
- bpl 1b
+ bhs 1b
2:
vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c
index 967507b..d5d4ea7 100644
--- a/string/bench/memcpy.c
+++ b/string/bench/memcpy.c
@@ -221,6 +221,40 @@ int main (void)
printf ("\n");
}
+ printf ("\nUnaligned forwards memmove:\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s (B/ns) ", funtab[f].name);
+
+ for (int size = 1024; size <= 32768; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ funtab[f].fun (a, a + 256 + (i & 31), size);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+ }
+ printf ("\n");
+ }
+
+
+ printf ("\nUnaligned backwards memmove:\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s (B/ns) ", funtab[f].name);
+
+ for (int size = 1024; size <= 32768; size *= 2)
+ {
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ funtab[f].fun (a + 256 + (i & 31), a, size);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+ }
+ printf ("\n");
+ }
return 0;
}
diff --git a/string/bench/strlen.c b/string/bench/strlen.c
new file mode 100644
index 0000000..cc0f04b
--- /dev/null
+++ b/string/bench/strlen.c
@@ -0,0 +1,221 @@
+/*
+ * strlen benchmark.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define _GNU_SOURCE
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "stringlib.h"
+#include "benchlib.h"
+
+#define ITERS 2000
+#define ITERS2 20000000
+#define ITERS3 2000000
+#define NUM_STRLEN 16384
+
+#define MAX_ALIGN 32
+#define MAX_STRLEN 256
+
+static char a[(MAX_STRLEN + 1) * MAX_ALIGN] __attribute__((__aligned__(4096)));
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+ const char *name;
+ size_t (*fun) (const char *s);
+ int test_mte;
+} funtab[] = {
+ // clang-format off
+ F(strlen, 0)
+#if __aarch64__
+ F(__strlen_aarch64, 0)
+ F(__strlen_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+ F(__strlen_aarch64_sve, 1)
+# endif
+#elif __arm__
+# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
+ F(__strlen_armv6t2, 0)
+# endif
+#endif
+ {0, 0, 0}
+ // clang-format on
+};
+#undef F
+
+static uint16_t strlen_tests[NUM_STRLEN];
+
+typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
+typedef struct { uint8_t align; uint16_t freq; } align_data_t;
+
+#define SIZE_NUM 65536
+#define SIZE_MASK (SIZE_NUM - 1)
+static uint8_t strlen_len_arr[SIZE_NUM];
+
+/* Frequency data for strlen sizes up to 128 based on SPEC2017. */
+static freq_data_t strlen_len_freq[] =
+{
+ { 12,22671}, { 18,12834}, { 13, 9555}, { 6, 6348}, { 17, 6095}, { 11, 2115},
+ { 10, 1335}, { 7, 814}, { 2, 646}, { 9, 483}, { 8, 471}, { 16, 418},
+ { 4, 390}, { 1, 388}, { 5, 233}, { 3, 204}, { 0, 79}, { 14, 79},
+ { 15, 69}, { 26, 36}, { 22, 35}, { 31, 24}, { 32, 24}, { 19, 21},
+ { 25, 17}, { 28, 15}, { 21, 14}, { 33, 14}, { 20, 13}, { 24, 9},
+ { 29, 9}, { 30, 9}, { 23, 7}, { 34, 7}, { 27, 6}, { 44, 5},
+ { 42, 4}, { 45, 3}, { 47, 3}, { 40, 2}, { 41, 2}, { 43, 2},
+ { 58, 2}, { 78, 2}, { 36, 2}, { 48, 1}, { 52, 1}, { 60, 1},
+ { 64, 1}, { 56, 1}, { 76, 1}, { 68, 1}, { 80, 1}, { 84, 1},
+ { 72, 1}, { 86, 1}, { 35, 1}, { 39, 1}, { 50, 1}, { 38, 1},
+ { 37, 1}, { 46, 1}, { 98, 1}, {102, 1}, {128, 1}, { 51, 1},
+ {107, 1}, { 0, 0}
+};
+
+#define ALIGN_NUM 1024
+#define ALIGN_MASK (ALIGN_NUM - 1)
+static uint8_t strlen_align_arr[ALIGN_NUM];
+
+/* Alignment data for strlen based on SPEC2017. */
+static align_data_t string_align_freq[] =
+{
+ {8, 470}, {32, 427}, {16, 99}, {1, 19}, {2, 6}, {4, 3}, {0, 0}
+};
+
+static void
+init_strlen_distribution (void)
+{
+ int i, j, freq, size, n;
+
+ for (n = i = 0; (freq = strlen_len_freq[i].freq) != 0; i++)
+ for (j = 0, size = strlen_len_freq[i].size; j < freq; j++)
+ strlen_len_arr[n++] = size;
+ assert (n == SIZE_NUM);
+
+ for (n = i = 0; (freq = string_align_freq[i].freq) != 0; i++)
+ for (j = 0, size = string_align_freq[i].align; j < freq; j++)
+ strlen_align_arr[n++] = size;
+ assert (n == ALIGN_NUM);
+}
+
+static void
+init_strlen_tests (void)
+{
+ uint16_t index[MAX_ALIGN];
+
+ memset (a, 'x', sizeof (a));
+
+ /* Create indices for strings at all alignments. */
+ for (int i = 0; i < MAX_ALIGN; i++)
+ {
+ index[i] = i * (MAX_STRLEN + 1);
+ a[index[i] + MAX_STRLEN] = 0;
+ }
+
+ /* Create a random set of strlen input strings using the string length
+ and alignment distributions. */
+ for (int n = 0; n < NUM_STRLEN; n++)
+ {
+ int align = strlen_align_arr[rand32 (0) & ALIGN_MASK];
+ int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK];
+
+ strlen_tests[n] =
+ index[(align + exp_len) & (MAX_ALIGN - 1)] + MAX_STRLEN - exp_len;
+ }
+}
+
+static volatile size_t maskv = 0;
+
+int main (void)
+{
+ rand32 (0x12345678);
+ init_strlen_distribution ();
+ init_strlen_tests ();
+
+ printf ("\nRandom strlen (bytes/ns):\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ size_t res = 0, strlen_size = 0, mask = maskv;
+ printf ("%22s ", funtab[f].name);
+
+ for (int c = 0; c < NUM_STRLEN; c++)
+ strlen_size += funtab[f].fun (a + strlen_tests[c]);
+ strlen_size *= ITERS;
+
+ /* Measure latency of strlen result with (res & mask). */
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS; i++)
+ for (int c = 0; c < NUM_STRLEN; c++)
+ res = funtab[f].fun (a + strlen_tests[c] + (res & mask));
+ t = clock_get_ns () - t;
+ printf ("%.2f\n", (double)strlen_size / t);
+ }
+
+ printf ("\nSmall aligned strlen (bytes/ns):\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s ", funtab[f].name);
+
+ for (int size = 1; size <= 64; size *= 2)
+ {
+ memset (a, 'x', size);
+ a[size - 1] = 0;
+
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS2; i++)
+ funtab[f].fun (a);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("\nSmall unaligned strlen (bytes/ns):\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s ", funtab[f].name);
+
+ int align = 9;
+ for (int size = 1; size <= 64; size *= 2)
+ {
+ memset (a + align, 'x', size);
+ a[align + size - 1] = 0;
+
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS2; i++)
+ funtab[f].fun (a + align);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("\nMedium strlen (bytes/ns):\n");
+ for (int f = 0; funtab[f].name != 0; f++)
+ {
+ printf ("%22s ", funtab[f].name);
+
+ for (int size = 128; size <= 4096; size *= 2)
+ {
+ memset (a, 'x', size);
+ a[size - 1] = 0;
+
+ uint64_t t = clock_get_ns ();
+ for (int i = 0; i < ITERS3; i++)
+ funtab[f].fun (a);
+ t = clock_get_ns () - t;
+ printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+ size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+ }
+ printf ("\n");
+ }
+
+ printf ("\n");
+
+ return 0;
+}