diff options
author | Wilco Dijkstra <wilco.dijkstra@arm.com> | 2020-05-12 11:58:30 +0100 |
---|---|---|
committer | Szabolcs Nagy <szabolcs.nagy@arm.com> | 2020-05-12 17:02:15 +0100 |
commit | 6b23ea83a64195e0c22f159c138a912cd2818f30 (patch) | |
tree | cf15ece47fa8501ec9721b7c1b2adbbfb03d46d9 /string | |
parent | 0b7d1aeb9db4ac6791a030d2bb841d8ea9c665e5 (diff) | |
download | arm-optimized-routines-6b23ea83a64195e0c22f159c138a912cd2818f30.tar.gz |
string: Improve strnlen performance
Improve strnlen performance by using a much simpler SIMD implementation.
On modern micro architectures the speedup is 2.3x on large strings and
1.5x on small strings.
Diffstat (limited to 'string')
-rw-r--r-- | string/aarch64/strnlen.S | 218 |
1 files changed, 86 insertions, 132 deletions
diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S index 202c401..2543175 100644 --- a/string/aarch64/strnlen.S +++ b/string/aarch64/strnlen.S @@ -1,156 +1,110 @@ /* * strnlen - calculate the length of a string with limit. * - * Copyright (c) 2013, Arm Limited. + * Copyright (c) 2020, Arm Limited. * SPDX-License-Identifier: MIT */ /* Assumptions: * - * ARMv8-a, AArch64 + * ARMv8-a, AArch64, Advanced SIMD. + * MTE compatible. */ #include "../asmdefs.h" -/* Arguments and results. */ #define srcin x0 -#define len x0 -#define limit x1 +#define cntin x1 +#define result x0 -/* Locals and temporaries. */ #define src x2 -#define data1 x3 -#define data2 x4 -#define data2a x5 -#define has_nul1 x6 -#define has_nul2 x7 -#define tmp1 x8 -#define tmp2 x9 -#define tmp3 x10 -#define tmp4 x11 -#define zeroones x12 -#define pos x13 -#define limit_wd x14 +#define synd x3 +#define shift x4 +#define wtmp w4 +#define tmp x4 +#define cntrem x5 + +#define qdata q0 +#define vdata v0 +#define vhas_chr v1 +#define vrepmask v2 +#define vend v3 +#define dend d3 -#define REP8_01 0x0101010101010101 -#define REP8_7f 0x7f7f7f7f7f7f7f7f -#define REP8_80 0x8080808080808080 - - .text - .p2align 6 -L(start): - /* Pre-pad to ensure critical loop begins an icache line. */ - .rep 6 - nop - .endr - /* Put this code here to avoid wasting more space with pre-padding. */ -L(hit_limit): - mov len, limit +/* + Core algorithm: + + For each 16-byte chunk we calculate a 64-bit syndrome value with four bits + per byte. For even bytes, bits 0-3 are set if the relevant byte matched the + requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are + set likewise for odd bytes so that adjacent bytes can be merged. Since the + bits in the syndrome reflect the order in which things occur in the original + string, counting trailing zeros identifies exactly which byte matched. */ + +ENTRY (__strnlen_aarch64) + bic src, srcin, 15 + mov wtmp, 0xf00f + cbz cntin, L(nomatch) + ld1 {vdata.16b}, [src], 16 + dup vrepmask.8h, wtmp + cmeq vhas_chr.16b, vdata.16b, 0 + lsl shift, srcin, 2 + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + lsr synd, synd, shift + cbz synd, L(start_loop) +L(finish): + rbit synd, synd + clz synd, synd + lsr result, synd, 2 + cmp cntin, result + csel result, cntin, result, ls ret -ENTRY_ALIGN (__strnlen_aarch64, 0) - cbz limit, L(hit_limit) - mov zeroones, #REP8_01 - bic src, srcin, #15 - ands tmp1, srcin, #15 - b.ne L(misaligned) - /* Calculate the number of full and partial words -1. */ - sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */ - lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */ - - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - can be done in parallel across the entire word. */ - /* The inner loop deals with two Dwords at a time. This has a - slightly higher start-up cost, but we should win quite quickly, - especially on cores with a high number of issue slots per - cycle, as we get much better parallelism out of the operations. */ - - /* Start of critial section -- keep to one 64Byte cache line. */ -L(loop): - ldp data1, data2, [src], #16 -L(realigned): - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bic has_nul2, tmp3, tmp4 - subs limit_wd, limit_wd, #1 - orr tmp1, has_nul1, has_nul2 - ccmp tmp1, #0, #0, pl /* NZCV = 0000 */ - b.eq L(loop) - /* End of critical section -- keep to one 64Byte cache line. */ - - orr tmp1, has_nul1, has_nul2 - cbz tmp1, L(hit_limit) /* No null in final Qword. */ - - /* We know there's a null in the final Qword. The easiest thing - to do now is work out the length of the string and return - MIN (len, limit). */ - - sub len, src, srcin - cbz has_nul1, L(nul_in_data2) -#ifdef __AARCH64EB__ - mov data2, data1 -#endif - sub len, len, #8 - mov has_nul2, has_nul1 -L(nul_in_data2): -#ifdef __AARCH64EB__ - /* For big-endian, carry propagation (if the final byte in the - string is 0x01) means we cannot use has_nul directly. The - easiest way to get the correct byte is to byte-swap the data - and calculate the syndrome a second time. */ - rev data2, data2 - sub tmp1, data2, zeroones - orr tmp2, data2, #REP8_7f - bic has_nul2, tmp1, tmp2 +L(start_loop): + sub tmp, src, srcin + subs cntrem, cntin, tmp + b.ls L(nomatch) + + /* Make sure that it won't overread by a 16-byte chunk */ + add tmp, cntrem, 15 + tbnz tmp, 4, L(loop32_2) + + .p2align 5 +L(loop32): + ldr qdata, [src], 16 + cmeq vhas_chr.16b, vdata.16b, 0 + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + cbnz synd, L(end) +L(loop32_2): + ldr qdata, [src], 16 + subs cntrem, cntrem, 32 + cmeq vhas_chr.16b, vdata.16b, 0 + b.ls L(end) + umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + fmov synd, dend + cbz synd, L(loop32) + +L(end): + and vhas_chr.16b, vhas_chr.16b, vrepmask.16b + addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */ + sub src, src, 16 + mov synd, vend.d[0] + sub result, src, srcin +#ifndef __AARCH64EB__ + rbit synd, synd #endif - sub len, len, #8 - rev has_nul2, has_nul2 - clz pos, has_nul2 - add len, len, pos, lsr #3 /* Bits to bytes. */ - cmp len, limit - csel len, len, limit, ls /* Return the lower value. */ + clz synd, synd + add result, result, synd, lsr 2 + cmp cntin, result + csel result, cntin, result, ls ret -L(misaligned): - /* Deal with a partial first word. - We're doing two things in parallel here; - 1) Calculate the number of words (but avoiding overflow if - limit is near ULONG_MAX) - to do this we need to work out - limit + tmp1 - 1 as a 65-bit value before shifting it; - 2) Load and mask the initial data words - we force the bytes - before the ones we are interested in to 0xff - this ensures - early bytes will not hit any zero detection. */ - sub limit_wd, limit, #1 - neg tmp4, tmp1 - cmp tmp1, #8 - - and tmp3, limit_wd, #15 - lsr limit_wd, limit_wd, #4 - mov tmp2, #~0 - - ldp data1, data2, [src], #16 - lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */ - add tmp3, tmp3, tmp1 - -#ifdef __AARCH64EB__ - /* Big-endian. Early bytes are at MSB. */ - lsl tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */ -#else - /* Little-endian. Early bytes are at LSB. */ - lsr tmp2, tmp2, tmp4 /* Shift (tmp1 & 63). */ -#endif - add limit_wd, limit_wd, tmp3, lsr #4 - - orr data1, data1, tmp2 - orr data2a, data2, tmp2 - - csinv data1, data1, xzr, le - csel data2, data2, data2a, le - b L(realigned) +L(nomatch): + mov result, cntin + ret END (__strnlen_aarch64) |