author     Treehugger Robot <treehugger-gerrit@google.com>  2020-10-29 19:49:03 +0000
committer  Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>  2020-10-29 19:49:03 +0000
commit     de66eb189d01931baaec370767da29976a4ef88e (patch)
tree       5a0a970a664c3e5c888d0458b1d228b5ded9af8e
parent     e49b9c0109e3c47a132601b22b4cc26b917f5d20 (diff)
parent     a0f30b985085946b260e48e7083334bf52683b3c (diff)
download   arm-optimized-routines-de66eb189d01931baaec370767da29976a4ef88e.tar.gz

Merge "Upgrade arm-optimized-routines to 0f4ae0c5b561de25acb10130fd5e473ec038f89d" am: 614c18275d am: e923b18d84 am: a0f30b9850

Original change: https://android-review.googlesource.com/c/platform/external/arm-optimized-routines/+/1479384

Change-Id: I4c49e4cc4b51d315317a9226992cd850c71c504f
-rw-r--r--  METADATA                            6
-rw-r--r--  README                              2
-rw-r--r--  string/Dir.mk                       5
-rw-r--r--  string/aarch64/memcpy-advsimd.S     7
-rw-r--r--  string/aarch64/strcmp-mte.S        16
-rw-r--r--  string/aarch64/strlen.S           274
-rw-r--r--  string/aarch64/strncmp-mte.S        6
-rw-r--r--  string/arm/memcpy.S                22
-rw-r--r--  string/bench/memcpy.c              34
-rw-r--r--  string/bench/strlen.c             221
10 files changed, 423 insertions, 170 deletions
diff --git a/METADATA b/METADATA
--- a/METADATA
+++ b/METADATA
@@ -9,11 +9,11 @@ third_party {
     type: GIT
     value: "https://github.com/ARM-software/optimized-routines.git"
   }
-  version: "ef907c7a799ae7fd8662fb56a6c5e9437508f645"
+  version: "0f4ae0c5b561de25acb10130fd5e473ec038f89d"
   license_type: NOTICE
   last_upgrade_date {
     year: 2020
-    month: 6
-    day: 1
+    month: 10
+    day: 28
   }
 }
diff --git a/README b/README
--- a/README
+++ b/README
@@ -9,7 +9,7 @@ contributor-agreement.pdf.  This is needed so upstreaming code
 to projects that require copyright assignment is possible.
 
 Regular quarterly releases are tagged as vYY.MM, the latest
-release is v20.05.
+release is v20.08.
 
 Source code layout:
diff --git a/string/Dir.mk b/string/Dir.mk
index ae7c673..7817357 100644
--- a/string/Dir.mk
+++ b/string/Dir.mk
@@ -39,7 +39,9 @@ string-tests := \
 	build/bin/test/strnlen \
 	build/bin/test/strncmp
 
-string-benches := build/bin/bench/memcpy
+string-benches := \
+	build/bin/bench/memcpy \
+	build/bin/bench/strlen
 
 string-lib-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-lib-srcs)))
 string-test-objs := $(patsubst $(S)/%,$(B)/%.o,$(basename $(string-test-srcs)))
@@ -95,6 +97,7 @@ check-string: $(string-tests-out)
 	! grep FAIL $^
 
 bench-string: $(string-benches)
+	$(EMULATOR) build/bin/bench/strlen
 	$(EMULATOR) build/bin/bench/memcpy
 
 install-string: \
diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S
index 3004179..23545a3 100644
--- a/string/aarch64/memcpy-advsimd.S
+++ b/string/aarch64/memcpy-advsimd.S
@@ -179,12 +179,13 @@ L(copy_long_backwards):
 	b.ls	L(copy64_from_start)
 
 L(loop64_backwards):
-	stp	A_q, B_q, [dstend, -32]
+	str	B_q, [dstend, -16]
+	str	A_q, [dstend, -32]
 	ldp	A_q, B_q, [srcend, -96]
-	stp	C_q, D_q, [dstend, -64]
+	str	D_q, [dstend, -48]
+	str	C_q, [dstend, -64]!
 	ldp	C_q, D_q, [srcend, -128]
 	sub	srcend, srcend, 64
-	sub	dstend, dstend, 64
 	subs	count, count, 64
 	b.hi	L(loop64_backwards)
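The memcpy-advsimd.S hunk above splits each paired store in the backward-copy
loop into two single stores and folds the dstend decrement into a store with
writeback. For reference, a rough C model of a 64-bytes-per-iteration backward
copy (a sketch only; the function name and buffering are mine, not from the
patch, and the real code keeps two iterations of data in registers):

    /* Sketch: copy 64 bytes per iteration from the end of the buffers
       downwards, loading a whole chunk before storing it so overlapping
       regions are handled, as the ldp/str sequence does in registers.  */
    #include <string.h>
    #include <stddef.h>

    void copy64_backwards (unsigned char *dstend,
                           const unsigned char *srcend,
                           size_t count)
    {
      unsigned char chunk[64];
      while (count > 64)
        {
          srcend -= 64;
          dstend -= 64;
          memcpy (chunk, srcend, 64);   /* load first, like the ldp pairs */
          memcpy (dstend, chunk, 64);   /* then store, like the str pairs */
          count -= 64;
        }
      /* The final up-to-64 bytes are handled by separate tail code.  */
    }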
diff --git a/string/aarch64/strcmp-mte.S b/string/aarch64/strcmp-mte.S
index 8f2abc4..1b6db42 100644
--- a/string/aarch64/strcmp-mte.S
+++ b/string/aarch64/strcmp-mte.S
@@ -99,6 +99,8 @@ L(end):
 	sub	result, data1, data2, lsr 56
 	ret
 
+	.p2align 4
+
 L(mutual_align):
 	/* Sources are mutually aligned, but are not currently at an
 	   alignment boundary.  Round down the addresses and then mask off
@@ -127,17 +129,18 @@ L(do_misaligned):
 	b.ne	L(do_misaligned)
 
 L(src1_aligned):
-	lsl	shift, src2, 3
+	neg	shift, src2, lsl 3
 	bic	src2, src2, 7
 	ldr	data3, [src2], 8
 #ifdef __AARCH64EB__
 	rev	data3, data3
 #endif
+	lsr	tmp, zeroones, shift
+	orr	data3, data3, tmp
 	sub	has_nul, data3, zeroones
 	orr	tmp, data3, REP8_7f
-	bic	has_nul, has_nul, tmp
-	lsr	tmp, has_nul, shift
-	cbnz	tmp, L(tail)
+	bics	has_nul, has_nul, tmp
+	b.ne	L(tail)
 
 	sub	off1, src2, src1
 
@@ -156,8 +159,7 @@ L(loop_unaligned):
 	ccmp	data1, data2, 0, eq
 	b.eq	L(loop_unaligned)
 
-	neg	tmp, shift
-	lsl	tmp, has_nul, tmp
+	lsl	tmp, has_nul, shift
#ifdef __AARCH64EB__
 	rev	tmp, tmp
 #endif
@@ -166,6 +168,7 @@ L(loop_unaligned):
 	cbnz	syndrome, L(end)
 
 L(tail):
 	ldr	data1, [src1]
+	neg	shift, shift
 	lsr	data2, data3, shift
 	lsr	has_nul, has_nul, shift
 #ifdef __AARCH64EB__
@@ -180,6 +183,5 @@ L(done):
 	sub	result, data1, data2
 	ret
 
-
 END (__strcmp_aarch64_mte)
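In the strcmp-mte.S src1_aligned hunk, the first aligned load of src2 may pull
in bytes that sit before the start of the string; ORing in zeroones >> shift
forces those bytes to be non-zero so the NUL test cannot fire on them. A C
sketch of the idea (identifiers mine; assumes little-endian and a misaligned
address, i.e. addr & 7 != 0, which is the only case that reaches this path):

    /* Sketch: make an aligned 8-byte load safe when the string starts
       mid-word, by forcing the bytes below the string start non-zero.
       shift = (64 - 8 * (addr & 7)) & 63, as "neg shift, src2, lsl 3"
       computes modulo 64.  */
    #include <stdint.h>

    #define REP8_01 0x0101010101010101ULL

    uint64_t mask_before_start (uint64_t data, uintptr_t addr)
    {
      unsigned shift = (64 - 8 * (addr & 7)) & 63;
      /* Puts 0x01 in each byte that precedes the string, so the
         word-at-a-time NUL test cannot match on garbage bytes.  */
      return data | (REP8_01 >> shift);
    }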
diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S
index 3aa444b..b20eaeb 100644
--- a/string/aarch64/strlen.S
+++ b/string/aarch64/strlen.S
@@ -1,84 +1,87 @@
 /*
- * strlen - calculate the length of a string
+ * strlen - calculate the length of a string.
  *
- * Copyright (c) 2013, Arm Limited.
+ * Copyright (c) 2020, Arm Limited.
  * SPDX-License-Identifier: MIT
  */
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ * Not MTE compatible.
  */
 
 #include "../asmdefs.h"
 
-/* To test the page crossing code path more thoroughly, compile with
-   -DTEST_PAGE_CROSS - this will force all calls through the slower
-   entry path.  This option is not intended for production use.  */
-
-/* Arguments and results.  */
-#define srcin		x0
-#define len		x0
-
-/* Locals and temporaries.  */
-#define src		x1
-#define data1		x2
-#define data2		x3
-#define has_nul1	x4
-#define has_nul2	x5
-#define tmp1		x4
-#define tmp2		x5
-#define tmp3		x6
-#define tmp4		x7
-#define zeroones	x8
-
-	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
-	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
-	   can be done in parallel across the entire word.  A faster check
-	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
-	   false hits for characters 129..255.  */
+#define srcin		x0
+#define len		x0
+
+#define src		x1
+#define data1		x2
+#define data2		x3
+#define has_nul1	x4
+#define has_nul2	x5
+#define tmp1		x4
+#define tmp2		x5
+#define tmp3		x6
+#define tmp4		x7
+#define zeroones	x8
+
+#define maskv		v0
+#define maskd		d0
+#define dataq1		q1
+#define dataq2		q2
+#define datav1		v1
+#define datav2		v2
+#define tmp		x2
+#define tmpw		w2
+#define synd		x3
+#define shift		x4
+
+/* For the first 32 bytes, NUL detection works on the principle that
+   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
+   byte is zero, and can be done in parallel across the entire word.  */
 
 #define REP8_01 0x0101010101010101
 #define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
+
+/* To test the page crossing code path more thoroughly, compile with
+   -DTEST_PAGE_CROSS - this will force all calls through the slower
+   entry path.  This option is not intended for production use.  */
 
 #ifdef TEST_PAGE_CROSS
-# define MIN_PAGE_SIZE 15
+# define MIN_PAGE_SIZE 32
 #else
 # define MIN_PAGE_SIZE 4096
 #endif
 
-	/* Since strings are short on average, we check the first 16 bytes
-	   of the string for a NUL character.  In order to do an unaligned ldp
-	   safely we have to do a page cross check first.  If there is a NUL
-	   byte we calculate the length from the 2 8-byte words using
-	   conditional select to reduce branch mispredictions (it is unlikely
-	   __strlen_aarch64 will be repeatedly called on strings with the
-	   same length).
-
-	   If the string is longer than 16 bytes, we align src so don't need
-	   further page cross checks, and process 32 bytes per iteration
-	   using the fast NUL check.  If we encounter non-ASCII characters,
-	   fallback to a second loop using the full NUL check.
-
-	   If the page cross check fails, we read 16 bytes from an aligned
-	   address, remove any characters before the string, and continue
-	   in the main loop using aligned loads.  Since strings crossing a
-	   page in the first 16 bytes are rare (probability of
-	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
-
-	   AArch64 systems have a minimum page size of 4k.  We don't bother
-	   checking for larger page sizes - the cost of setting up the correct
-	   page size is just not worth the extra gain from a small reduction in
-	   the cases taking the slow path.  Note that we only care about
-	   whether the first fetch, which may be misaligned, crosses a page
-	   boundary.  */
+/* Core algorithm:
+
+   Since strings are short on average, we check the first 32 bytes of the
+   string for a NUL character without aligning the string.  In order to use
+   unaligned loads safely we must do a page cross check first.
+
+   If there is a NUL byte we calculate the length from the 2 8-byte words
+   using conditional select to reduce branch mispredictions (it is unlikely
+   strlen will be repeatedly called on strings with the same length).
+
+   If the string is longer than 32 bytes, align src so we don't need further
+   page cross checks, and process 32 bytes per iteration using a fast SIMD
+   loop.
+
+   If the page cross check fails, we read 32 bytes from an aligned address,
+   and ignore any characters before the string.  If it contains a NUL
+   character, return the length, if not, continue in the main loop.  */
 
 ENTRY (__strlen_aarch64)
 	and	tmp1, srcin, MIN_PAGE_SIZE - 1
-	mov	zeroones, REP8_01
-	cmp	tmp1, MIN_PAGE_SIZE - 16
-	b.gt	L(page_cross)
+	cmp	tmp1, MIN_PAGE_SIZE - 32
+	b.hi	L(page_cross)
+
+	/* Look for a NUL byte in the first 16 bytes.  */
 	ldp	data1, data2, [srcin]
+	mov	zeroones, REP8_01
+
 #ifdef __AARCH64EB__
 	/* For big-endian, carry propagation (if the final byte in the
 	   string is 0x01) means we cannot use has_nul1/2 directly.
@@ -94,114 +97,103 @@ ENTRY (__strlen_aarch64)
 	bics	has_nul1, tmp1, tmp2
 	bic	has_nul2, tmp3, tmp4
 	ccmp	has_nul2, 0, 0, eq
-	beq	L(main_loop_entry)
+	b.eq	L(bytes16_31)
 
-	/* Enter with C = has_nul1 == 0.  */
+	/* Find the exact offset of the first NUL byte in the first 16 bytes
+	   from the string start.  Enter with C = has_nul1 == 0.  */
 	csel	has_nul1, has_nul1, has_nul2, cc
 	mov	len, 8
 	rev	has_nul1, has_nul1
-	clz	tmp1, has_nul1
 	csel	len, xzr, len, cc
+	clz	tmp1, has_nul1
 	add	len, len, tmp1, lsr 3
 	ret
 
-	/* The inner loop processes 32 bytes per iteration and uses the fast
-	   NUL check.  If we encounter non-ASCII characters, use a second
-	   loop with the accurate NUL check.  */
-	.p2align 4
-L(main_loop_entry):
-	bic	src, srcin, 15
-	sub	src, src, 16
-L(main_loop):
-	ldp	data1, data2, [src, 32]!
-L(page_cross_entry):
-	sub	tmp1, data1, zeroones
-	sub	tmp3, data2, zeroones
-	orr	tmp2, tmp1, tmp3
-	tst	tmp2, zeroones, lsl 7
-	bne	1f
-	ldp	data1, data2, [src, 16]
+	.p2align 3
+	/* Look for a NUL byte at offset 16..31 in the string.  */
+L(bytes16_31):
+	ldp	data1, data2, [srcin, 16]
+#ifdef __AARCH64EB__
+	rev	data1, data1
+	rev	data2, data2
+#endif
 	sub	tmp1, data1, zeroones
-	sub	tmp3, data2, zeroones
-	orr	tmp2, tmp1, tmp3
-	tst	tmp2, zeroones, lsl 7
-	beq	L(main_loop)
-	add	src, src, 16
-1:
-	/* The fast check failed, so do the slower, accurate NUL check.  */
 	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
 	orr	tmp4, data2, REP8_7f
 	bics	has_nul1, tmp1, tmp2
 	bic	has_nul2, tmp3, tmp4
 	ccmp	has_nul2, 0, 0, eq
-	beq	L(nonascii_loop)
+	b.eq	L(loop_entry)
 
-	/* Enter with C = has_nul1 == 0.  */
-L(tail):
-#ifdef __AARCH64EB__
-	/* For big-endian, carry propagation (if the final byte in the
-	   string is 0x01) means we cannot use has_nul1/2 directly.  The
-	   easiest way to get the correct byte is to byte-swap the data
-	   and calculate the syndrome a second time.  */
-	csel	data1, data1, data2, cc
-	rev	data1, data1
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, REP8_7f
-	bic	has_nul1, tmp1, tmp2
-#else
+	/* Find the exact offset of the first NUL byte at offset 16..31 from
+	   the string start.  Enter with C = has_nul1 == 0.  */
 	csel	has_nul1, has_nul1, has_nul2, cc
-#endif
-	sub	len, src, srcin
+	mov	len, 24
 	rev	has_nul1, has_nul1
-	add	tmp2, len, 8
+	mov	tmp3, 16
 	clz	tmp1, has_nul1
-	csel	len, len, tmp2, cc
+	csel	len, tmp3, len, cc
 	add	len, len, tmp1, lsr 3
 	ret
 
-L(nonascii_loop):
-	ldp	data1, data2, [src, 16]!
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, REP8_7f
-	bics	has_nul1, tmp1, tmp2
-	bic	has_nul2, tmp3, tmp4
-	ccmp	has_nul2, 0, 0, eq
-	bne	L(tail)
-	ldp	data1, data2, [src, 16]!
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, REP8_7f
-	bics	has_nul1, tmp1, tmp2
-	bic	has_nul2, tmp3, tmp4
-	ccmp	has_nul2, 0, 0, eq
-	beq	L(nonascii_loop)
-	b	L(tail)
+L(loop_entry):
+	bic	src, srcin, 31
 
-	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
-	   srcin to 0x7f, so we ignore any NUL bytes before the string.
-	   Then continue in the aligned loop.  */
-L(page_cross):
-	bic	src, srcin, 15
-	ldp	data1, data2, [src]
-	lsl	tmp1, srcin, 3
-	mov	tmp4, -1
+	.p2align 5
+L(loop):
+	ldp	dataq1, dataq2, [src, 32]!
+	uminp	maskv.16b, datav1.16b, datav2.16b
+	uminp	maskv.16b, maskv.16b, maskv.16b
+	cmeq	maskv.8b, maskv.8b, 0
+	fmov	synd, maskd
+	cbz	synd, L(loop)
+
+	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
+	cmeq	maskv.16b, datav1.16b, 0
+	sub	len, src, srcin
+	tst	synd, 0xffffffff
+	b.ne	1f
+	cmeq	maskv.16b, datav2.16b, 0
+	add	len, len, 16
+1:
+	/* Generate a bitmask and compute correct byte offset.  */
 #ifdef __AARCH64EB__
-	/* Big-endian.  Early bytes are at MSB.  */
-	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
+	bic	maskv.8h, 0xf0
 #else
-	/* Little-endian.  Early bytes are at LSB.  */
-	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
+	bic	maskv.8h, 0x0f, lsl 8
 #endif
-	orr	tmp1, tmp1, REP8_80
-	orn	data1, data1, tmp1
-	orn	tmp2, data2, tmp1
-	tst	srcin, 8
-	csel	data1, data1, tmp4, eq
-	csel	data2, data2, tmp2, eq
-	b	L(page_cross_entry)
+	umaxp	maskv.16b, maskv.16b, maskv.16b
+	fmov	synd, maskd
+#ifndef __AARCH64EB__
+	rbit	synd, synd
+#endif
+	clz	tmp, synd
+	add	len, len, tmp, lsr 2
+	ret
 
-END (__strlen_aarch64)
+	.p2align 4
+L(page_cross):
+	bic	src, srcin, 31
+	mov	tmpw, 0x0c03
+	movk	tmpw, 0xc030, lsl 16
+	ld1	{datav1.16b, datav2.16b}, [src]
+	dup	maskv.4s, tmpw
+	cmeq	datav1.16b, datav1.16b, 0
+	cmeq	datav2.16b, datav2.16b, 0
+	and	datav1.16b, datav1.16b, maskv.16b
+	and	datav2.16b, datav2.16b, maskv.16b
+	addp	maskv.16b, datav1.16b, datav2.16b
+	addp	maskv.16b, maskv.16b, maskv.16b
+	fmov	synd, maskd
+	lsl	shift, srcin, 1
+	lsr	synd, synd, shift
+	cbz	synd, L(loop)
+
+	rbit	synd, synd
+	clz	len, synd
+	lsr	len, len, 1
+	ret
+
+END (__strlen_aarch64)
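The rewritten strlen.S keeps the classic word-at-a-time NUL test for the first
32 bytes. A minimal, self-checking C sketch of the (X - 1) & ~(X | 0x7f) trick
from the comment above (mine, not part of the patch):

    /* Sketch: non-zero iff some byte of x is zero.  Subtracting 1 from
       each byte borrows into bit 7 only when the byte was 0 or >= 0x80;
       the ~(x | 0x7f) term filters out the >= 0x80 false positives.
       Note: when a byte is 0, the borrow can propagate into higher
       bytes, which is why the assembly byte-swaps before locating the
       exact byte on big-endian.  */
    #include <assert.h>
    #include <stdint.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

    static uint64_t has_nul (uint64_t x)
    {
      return (x - REP8_01) & ~(x | REP8_7f);
    }

    int main (void)
    {
      /* Exhaustively check one byte position against all 256 values.  */
      for (unsigned b = 0; b < 256; b++)
        assert ((has_nul (0x0101010101010100ULL | b) != 0) == (b == 0));
      return 0;
    }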
diff --git a/string/aarch64/strncmp-mte.S b/string/aarch64/strncmp-mte.S
index b7e3914..46765d6 100644
--- a/string/aarch64/strncmp-mte.S
+++ b/string/aarch64/strncmp-mte.S
@@ -167,10 +167,10 @@ L(mutual_align):
 	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align).  */
 	ldr	data2, [src2], #8
 	mov	tmp2, #~0
-	and	count, count, #0x3f
 	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
-	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant.  */
-	add	limit, limit, count
+	/* Adjust the limit and ensure it doesn't overflow.  */
+	adds	limit, limit, count
+	csinv	limit, limit, xzr, lo
 	orr	data1, data1, tmp2
 	orr	data2, data2, tmp2
 	b	L(start_realigned)
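The strncmp-mte.S hunk turns the limit adjustment into a saturating add: adds
sets the carry flag on unsigned wrap-around, and csinv then substitutes
all-ones. An equivalent C sketch of the two-instruction idiom (identifiers
mine):

    /* Sketch: saturating unsigned add, mirroring "adds; csinv ..., lo".
       "lo" (carry clear, no wrap-around) keeps the plain sum; otherwise
       csinv yields ~xzr, i.e. UINT64_MAX.  */
    #include <stdint.h>

    uint64_t sat_add_u64 (uint64_t limit, uint64_t count)
    {
      uint64_t sum = limit + count;
      return sum < limit ? UINT64_MAX : sum;   /* sum < limit detects wrap */
    }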
diff --git a/string/arm/memcpy.S b/string/arm/memcpy.S
index aab78a2..4a34142 100644
--- a/string/arm/memcpy.S
+++ b/string/arm/memcpy.S
@@ -124,7 +124,7 @@ ENTRY (__memcpy_arm)
 
 	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
 	cmp	count, #64
-	bge	L(cpy_not_short)
+	bhs	L(cpy_not_short)
 	/* Deal with small copies quickly by dropping straight into the
 	   exit block.  */
 
@@ -239,10 +239,10 @@ L(cpy_not_short):
 
 1:
 	subs	tmp2, count, #64	/* Use tmp2 for count.  */
-	blt	L(tail63aligned)
+	blo	L(tail63aligned)
 
 	cmp	tmp2, #512
-	bge	L(cpy_body_long)
+	bhs	L(cpy_body_long)
 
 L(cpy_body_medium):			/* Count in tmp2.  */
 #ifdef USE_VFP
@@ -266,7 +266,7 @@ L(cpy_body_medium):			/* Count in tmp2.  */
 	add	src, src, #64
 	vstr	d1, [dst, #56]
 	add	dst, dst, #64
-	bge	1b
+	bhs	1b
 	tst	tmp2, #0x3f
 	beq	L(done)
 
@@ -312,7 +312,7 @@ L(tail63aligned):			/* Count in tmp2.  */
 	ldrd	A_l, A_h, [src, #64]!
 	strd	A_l, A_h, [dst, #64]!
 	subs	tmp2, tmp2, #64
-	bge	1b
+	bhs	1b
 	tst	tmp2, #0x3f
 	bne	1f
 	ldr	tmp2, [sp], #FRAME_SIZE
@@ -383,7 +383,7 @@ L(cpy_body_long):			/* Count in tmp2.  */
 	add	src, src, #32
 
 	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
-	blt	2f
+	blo	2f
 1:
 	cpy_line_vfp	d3, 0
 	cpy_line_vfp	d4, 64
@@ -395,7 +395,7 @@ L(cpy_body_long):			/* Count in tmp2.  */
 	add	dst, dst, #2 * 64
 	add	src, src, #2 * 64
 	subs	tmp2, tmp2, #prefetch_lines * 64
-	bge	1b
+	bhs	1b
 2:
 	cpy_tail_vfp	d3, 0
 
@@ -499,15 +499,15 @@ L(cpy_notaligned):
 1:
 	pld	[src, #(3 * 64)]
 	subs	count, count, #64
-	ldrmi	tmp2, [sp], #FRAME_SIZE
-	bmi	L(tail63unaligned)
+	ldrlo	tmp2, [sp], #FRAME_SIZE
+	blo	L(tail63unaligned)
 	pld	[src, #(4 * 64)]
 #ifdef USE_NEON
 	vld1.8	{d0-d3}, [src]!
 	vld1.8	{d4-d7}, [src]!
 	subs	count, count, #64
-	bmi	2f
+	blo	2f
 1:
 	pld	[src, #(4 * 64)]
 	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
@@ -515,7 +515,7 @@ L(cpy_notaligned):
 	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
 	vld1.8	{d4-d7}, [src]!
 	subs	count, count, #64
-	bpl	1b
+	bhs	1b
 2:
 	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
 	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
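The arm/memcpy.S hunks replace signed branch conditions (bge/blt/bmi/bpl) with
unsigned ones (bhs/blo): count is a size_t, so a signed compare misreads
values with the top bit set. A small C demonstration of the hazard (a sketch,
assuming 32-bit registers and a typical two's-complement target):

    /* Sketch: why a signed "count >= 64" test is wrong for size_t.
       A count >= 0x80000000 looks negative to a signed compare, so the
       large-copy path would wrongly be skipped.  */
    #include <stdint.h>
    #include <stdio.h>

    int main (void)
    {
      uint32_t count = 0x80000040u;                      /* > 2 GiB */
      printf ("signed:   %d\n", (int32_t) count >= 64);  /* 0: wrong   */
      printf ("unsigned: %d\n", count >= 64u);           /* 1: correct */
      return 0;
    }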
+ * SPDX-License-Identifier: MIT + */ + +#define _GNU_SOURCE +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include "stringlib.h" +#include "benchlib.h" + +#define ITERS 2000 +#define ITERS2 20000000 +#define ITERS3 2000000 +#define NUM_STRLEN 16384 + +#define MAX_ALIGN 32 +#define MAX_STRLEN 256 + +static char a[(MAX_STRLEN + 1) * MAX_ALIGN] __attribute__((__aligned__(4096))); + +#define F(x, mte) {#x, x, mte}, + +static const struct fun +{ + const char *name; + size_t (*fun) (const char *s); + int test_mte; +} funtab[] = { + // clang-format off + F(strlen, 0) +#if __aarch64__ + F(__strlen_aarch64, 0) + F(__strlen_aarch64_mte, 1) +# if __ARM_FEATURE_SVE + F(__strlen_aarch64_sve, 1) +# endif +#elif __arm__ +# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2 + F(__strlen_armv6t2, 0) +# endif +#endif + {0, 0, 0} + // clang-format on +}; +#undef F + +static uint16_t strlen_tests[NUM_STRLEN]; + +typedef struct { uint16_t size; uint16_t freq; } freq_data_t; +typedef struct { uint8_t align; uint16_t freq; } align_data_t; + +#define SIZE_NUM 65536 +#define SIZE_MASK (SIZE_NUM - 1) +static uint8_t strlen_len_arr[SIZE_NUM]; + +/* Frequency data for strlen sizes up to 128 based on SPEC2017. */ +static freq_data_t strlen_len_freq[] = +{ + { 12,22671}, { 18,12834}, { 13, 9555}, { 6, 6348}, { 17, 6095}, { 11, 2115}, + { 10, 1335}, { 7, 814}, { 2, 646}, { 9, 483}, { 8, 471}, { 16, 418}, + { 4, 390}, { 1, 388}, { 5, 233}, { 3, 204}, { 0, 79}, { 14, 79}, + { 15, 69}, { 26, 36}, { 22, 35}, { 31, 24}, { 32, 24}, { 19, 21}, + { 25, 17}, { 28, 15}, { 21, 14}, { 33, 14}, { 20, 13}, { 24, 9}, + { 29, 9}, { 30, 9}, { 23, 7}, { 34, 7}, { 27, 6}, { 44, 5}, + { 42, 4}, { 45, 3}, { 47, 3}, { 40, 2}, { 41, 2}, { 43, 2}, + { 58, 2}, { 78, 2}, { 36, 2}, { 48, 1}, { 52, 1}, { 60, 1}, + { 64, 1}, { 56, 1}, { 76, 1}, { 68, 1}, { 80, 1}, { 84, 1}, + { 72, 1}, { 86, 1}, { 35, 1}, { 39, 1}, { 50, 1}, { 38, 1}, + { 37, 1}, { 46, 1}, { 98, 1}, {102, 1}, {128, 1}, { 51, 1}, + {107, 1}, { 0, 0} +}; + +#define ALIGN_NUM 1024 +#define ALIGN_MASK (ALIGN_NUM - 1) +static uint8_t strlen_align_arr[ALIGN_NUM]; + +/* Alignment data for strlen based on SPEC2017. */ +static align_data_t string_align_freq[] = +{ + {8, 470}, {32, 427}, {16, 99}, {1, 19}, {2, 6}, {4, 3}, {0, 0} +}; + +static void +init_strlen_distribution (void) +{ + int i, j, freq, size, n; + + for (n = i = 0; (freq = strlen_len_freq[i].freq) != 0; i++) + for (j = 0, size = strlen_len_freq[i].size; j < freq; j++) + strlen_len_arr[n++] = size; + assert (n == SIZE_NUM); + + for (n = i = 0; (freq = string_align_freq[i].freq) != 0; i++) + for (j = 0, size = string_align_freq[i].align; j < freq; j++) + strlen_align_arr[n++] = size; + assert (n == ALIGN_NUM); +} + +static void +init_strlen_tests (void) +{ + uint16_t index[MAX_ALIGN]; + + memset (a, 'x', sizeof (a)); + + /* Create indices for strings at all alignments. */ + for (int i = 0; i < MAX_ALIGN; i++) + { + index[i] = i * (MAX_STRLEN + 1); + a[index[i] + MAX_STRLEN] = 0; + } + + /* Create a random set of strlen input strings using the string length + and alignment distributions. 
diff --git a/string/bench/strlen.c b/string/bench/strlen.c
new file mode 100644
index 0000000..cc0f04b
--- /dev/null
+++ b/string/bench/strlen.c
@@ -0,0 +1,221 @@
+/*
+ * strlen benchmark.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#define _GNU_SOURCE
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "stringlib.h"
+#include "benchlib.h"
+
+#define ITERS 2000
+#define ITERS2 20000000
+#define ITERS3 2000000
+#define NUM_STRLEN 16384
+
+#define MAX_ALIGN 32
+#define MAX_STRLEN 256
+
+static char a[(MAX_STRLEN + 1) * MAX_ALIGN] __attribute__((__aligned__(4096)));
+
+#define F(x, mte) {#x, x, mte},
+
+static const struct fun
+{
+  const char *name;
+  size_t (*fun) (const char *s);
+  int test_mte;
+} funtab[] = {
+  // clang-format off
+  F(strlen, 0)
+#if __aarch64__
+  F(__strlen_aarch64, 0)
+  F(__strlen_aarch64_mte, 1)
+# if __ARM_FEATURE_SVE
+  F(__strlen_aarch64_sve, 1)
+# endif
+#elif __arm__
+# if __ARM_ARCH >= 6 && __ARM_ARCH_ISA_THUMB == 2
+  F(__strlen_armv6t2, 0)
+# endif
+#endif
+  {0, 0, 0}
+  // clang-format on
+};
+#undef F
+
+static uint16_t strlen_tests[NUM_STRLEN];
+
+typedef struct { uint16_t size; uint16_t freq; } freq_data_t;
+typedef struct { uint8_t align; uint16_t freq; } align_data_t;
+
+#define SIZE_NUM 65536
+#define SIZE_MASK (SIZE_NUM - 1)
+static uint8_t strlen_len_arr[SIZE_NUM];
+
+/* Frequency data for strlen sizes up to 128 based on SPEC2017.  */
+static freq_data_t strlen_len_freq[] =
+{
+  { 12,22671}, { 18,12834}, { 13, 9555}, {  6, 6348}, { 17, 6095}, { 11, 2115},
+  { 10, 1335}, {  7,  814}, {  2,  646}, {  9,  483}, {  8,  471}, { 16,  418},
+  {  4,  390}, {  1,  388}, {  5,  233}, {  3,  204}, {  0,   79}, { 14,   79},
+  { 15,   69}, { 26,   36}, { 22,   35}, { 31,   24}, { 32,   24}, { 19,   21},
+  { 25,   17}, { 28,   15}, { 21,   14}, { 33,   14}, { 20,   13}, { 24,    9},
+  { 29,    9}, { 30,    9}, { 23,    7}, { 34,    7}, { 27,    6}, { 44,    5},
+  { 42,    4}, { 45,    3}, { 47,    3}, { 40,    2}, { 41,    2}, { 43,    2},
+  { 58,    2}, { 78,    2}, { 36,    2}, { 48,    1}, { 52,    1}, { 60,    1},
+  { 64,    1}, { 56,    1}, { 76,    1}, { 68,    1}, { 80,    1}, { 84,    1},
+  { 72,    1}, { 86,    1}, { 35,    1}, { 39,    1}, { 50,    1}, { 38,    1},
+  { 37,    1}, { 46,    1}, { 98,    1}, {102,    1}, {128,    1}, { 51,    1},
+  {107,    1}, {  0,    0}
+};
+
+#define ALIGN_NUM 1024
+#define ALIGN_MASK (ALIGN_NUM - 1)
+static uint8_t strlen_align_arr[ALIGN_NUM];
+
+/* Alignment data for strlen based on SPEC2017.  */
+static align_data_t string_align_freq[] =
+{
+  {8, 470}, {32, 427}, {16, 99}, {1, 19}, {2, 6}, {4, 3}, {0, 0}
+};
+
+static void
+init_strlen_distribution (void)
+{
+  int i, j, freq, size, n;
+
+  for (n = i = 0; (freq = strlen_len_freq[i].freq) != 0; i++)
+    for (j = 0, size = strlen_len_freq[i].size; j < freq; j++)
+      strlen_len_arr[n++] = size;
+  assert (n == SIZE_NUM);
+
+  for (n = i = 0; (freq = string_align_freq[i].freq) != 0; i++)
+    for (j = 0, size = string_align_freq[i].align; j < freq; j++)
+      strlen_align_arr[n++] = size;
+  assert (n == ALIGN_NUM);
+}
+
+static void
+init_strlen_tests (void)
+{
+  uint16_t index[MAX_ALIGN];
+
+  memset (a, 'x', sizeof (a));
+
+  /* Create indices for strings at all alignments.  */
+  for (int i = 0; i < MAX_ALIGN; i++)
+    {
+      index[i] = i * (MAX_STRLEN + 1);
+      a[index[i] + MAX_STRLEN] = 0;
+    }
+
+  /* Create a random set of strlen input strings using the string length
+     and alignment distributions.  */
+  for (int n = 0; n < NUM_STRLEN; n++)
+    {
+      int align = strlen_align_arr[rand32 (0) & ALIGN_MASK];
+      int exp_len = strlen_len_arr[rand32 (0) & SIZE_MASK];
+
+      strlen_tests[n] =
+	index[(align + exp_len) & (MAX_ALIGN - 1)] + MAX_STRLEN - exp_len;
+    }
+}
+
+static volatile size_t maskv = 0;
+
+int main (void)
+{
+  rand32 (0x12345678);
+  init_strlen_distribution ();
+  init_strlen_tests ();
+
+  printf ("\nRandom strlen (bytes/ns):\n");
+  for (int f = 0; funtab[f].name != 0; f++)
+    {
+      size_t res = 0, strlen_size = 0, mask = maskv;
+      printf ("%22s ", funtab[f].name);
+
+      for (int c = 0; c < NUM_STRLEN; c++)
+	strlen_size += funtab[f].fun (a + strlen_tests[c]);
+      strlen_size *= ITERS;
+
+      /* Measure latency of strlen result with (res & mask).  */
+      uint64_t t = clock_get_ns ();
+      for (int i = 0; i < ITERS; i++)
+	for (int c = 0; c < NUM_STRLEN; c++)
+	  res = funtab[f].fun (a + strlen_tests[c] + (res & mask));
+      t = clock_get_ns () - t;
+      printf ("%.2f\n", (double)strlen_size / t);
+    }
+
+  printf ("\nSmall aligned strlen (bytes/ns):\n");
+  for (int f = 0; funtab[f].name != 0; f++)
+    {
+      printf ("%22s ", funtab[f].name);
+
+      for (int size = 1; size <= 64; size *= 2)
+	{
+	  memset (a, 'x', size);
+	  a[size - 1] = 0;
+
+	  uint64_t t = clock_get_ns ();
+	  for (int i = 0; i < ITERS2; i++)
+	    funtab[f].fun (a);
+	  t = clock_get_ns () - t;
+	  printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+		  size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
+	}
+      printf ("\n");
+    }
+
+  printf ("\nSmall unaligned strlen (bytes/ns):\n");
+  for (int f = 0; funtab[f].name != 0; f++)
+    {
+      printf ("%22s ", funtab[f].name);
+
+      int align = 9;
+      for (int size = 1; size <= 64; size *= 2)
+	{
+	  memset (a + align, 'x', size);
+	  a[align + size - 1] = 0;
+
+	  uint64_t t = clock_get_ns ();
+	  for (int i = 0; i < ITERS2; i++)
+	    funtab[f].fun (a + align);
+	  t = clock_get_ns () - t;
+	  printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+		  size < 1024 ? 'B' : 'K', (double)size * ITERS2 / t);
+	}
+      printf ("\n");
+    }
+
+  printf ("\nMedium strlen (bytes/ns):\n");
+  for (int f = 0; funtab[f].name != 0; f++)
+    {
+      printf ("%22s ", funtab[f].name);
+
+      for (int size = 128; size <= 4096; size *= 2)
+	{
+	  memset (a, 'x', size);
+	  a[size - 1] = 0;
+
+	  uint64_t t = clock_get_ns ();
+	  for (int i = 0; i < ITERS3; i++)
+	    funtab[f].fun (a);
+	  t = clock_get_ns () - t;
+	  printf ("%d%c: %.2f ", size < 1024 ? size : size / 1024,
+		  size < 1024 ? 'B' : 'K', (double)size * ITERS3 / t);
+	}
+      printf ("\n");
+    }
+
+  printf ("\n");
+
+  return 0;
+}
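Note on the "Random strlen" loop in the new benchmark: it measures latency
rather than pure throughput. mask is loaded from a volatile and is always
zero, but the compiler cannot assume that, so each call's argument depends on
the previous result. A stripped-down C sketch of the idiom (identifiers mine):

    /* Sketch: dependency-chained timing loop.  Because maskv is
       volatile, the compiler must keep the AND, so every call's address
       depends on the previous strlen result and calls cannot overlap.  */
    #include <stddef.h>

    static volatile size_t maskv = 0;

    size_t strlen_latency (size_t (*fn) (const char *), const char *s,
                           int iters)
    {
      size_t res = 0, mask = maskv;     /* mask == 0, but opaque */
      for (int i = 0; i < iters; i++)
        res = fn (s + (res & mask));    /* chained on the last result */
      return res;
    }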