32 files changed, 598 insertions, 356 deletions
@@ -9,11 +9,11 @@ third_party { type: GIT value: "https://github.com/ARM-software/optimized-routines.git" } - version: "3377796fe24ff1d5396609205426402678208eb1" + version: "33ba19089a261964e1e84ba4edf90263b468c161" license_type: NOTICE last_upgrade_date { - year: 2019 - month: 12 - day: 26 + year: 2020 + month: 2 + day: 1 } } @@ -3,8 +3,10 @@ Arm Optimized Routines This repository contains implementations of library functions provided by Arm under MIT License (See LICENSE). Contributions -to this project are accepted, but the terms will need negotiation (so -relicensing and copyright assignment to the FSF is possible later). +to this project are accepted, but Contributors have to sign an +Assignment Agreement, please follow the instructions in +contributor-agreement.pdf. This is needed so upstreaming code +to projects that require copyright assignment is possible. Regular quarterly releases are tagged as vYY.MM (e.g. v19.11). diff --git a/contributor-agreement.pdf b/contributor-agreement.pdf Binary files differnew file mode 100644 index 0000000..f42c3ae --- /dev/null +++ b/contributor-agreement.pdf diff --git a/math/include/mathlib.h b/math/include/mathlib.h index 254954a..4493008 100644 --- a/math/include/mathlib.h +++ b/math/include/mathlib.h @@ -36,6 +36,7 @@ double __s_sin (double); double __s_cos (double); double __s_exp (double); double __s_log (double); +double __s_pow (double, double); #if __aarch64__ #if __GNUC__ >= 5 @@ -61,6 +62,7 @@ __f64x2_t __v_sin (__f64x2_t); __f64x2_t __v_cos (__f64x2_t); __f64x2_t __v_exp (__f64x2_t); __f64x2_t __v_log (__f64x2_t); +__f64x2_t __v_pow (__f64x2_t, __f64x2_t); #if __GNUC__ >= 9 || __clang_major__ >= 8 #define __vpcs __attribute__((__aarch64_vector_pcs__)) @@ -78,6 +80,7 @@ __vpcs __f64x2_t __vn_sin (__f64x2_t); __vpcs __f64x2_t __vn_cos (__f64x2_t); __vpcs __f64x2_t __vn_exp (__f64x2_t); __vpcs __f64x2_t __vn_log (__f64x2_t); +__vpcs __f64x2_t __vn_pow (__f64x2_t, __f64x2_t); /* Vector functions following the vector PCS using ABI names. */ __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t); @@ -90,6 +93,7 @@ __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t); +__vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t); #endif #endif @@ -347,7 +347,9 @@ pow (double x, double y) if (topx == 0) { /* Normalize subnormal x so exponent becomes negative. */ - ix = asuint64 (x * 0x1p52); + /* Without the barrier some versions of clang evalutate the mul + unconditionally causing spurious overflow exceptions. */ + ix = asuint64 (opt_barrier_double (x) * 0x1p52); ix &= 0x7fffffffffffffff; ix -= 52ULL << 52; } diff --git a/math/s_pow.c b/math/s_pow.c new file mode 100644 index 0000000..2e34c9f --- /dev/null +++ b/math/s_pow.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2020, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_pow.c" diff --git a/math/test/mathbench.c b/math/test/mathbench.c index 8d3ff1d..33ceda3 100644 --- a/math/test/mathbench.c +++ b/math/test/mathbench.c @@ -128,6 +128,18 @@ xy_Z_powf (v_float x) { return _ZGVnN4vv_powf (x, x); } + +__vpcs static v_double +xy__vn_pow (v_double x) +{ + return __vn_pow (x, x); +} + +__vpcs static v_double +xy_Z_pow (v_double x) +{ + return _ZGVnN2vv_pow (x, x); +} #endif static v_float @@ -135,6 +147,12 @@ xy__v_powf (v_float x) { return __v_powf (x, x); } + +static v_double +xy__v_pow (v_double x) +{ + return __v_pow (x, x); +} #endif static float @@ -142,6 +160,12 @@ xy__s_powf (float x) { return __s_powf (x, x); } + +static double +xy__s_pow (double x) +{ + return __s_pow (x, x); +} #endif static double @@ -256,6 +280,7 @@ D (__s_sin, -3.1, 3.1) D (__s_cos, -3.1, 3.1) D (__s_exp, -9.9, 9.9) D (__s_log, 0.01, 11.1) +{"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}}, F (__s_expf, -9.9, 9.9) F (__s_expf_1u, -9.9, 9.9) F (__s_exp2f, -9.9, 9.9) @@ -270,6 +295,7 @@ VD (__v_sin, -3.1, 3.1) VD (__v_cos, -3.1, 3.1) VD (__v_exp, -9.9, 9.9) VD (__v_log, 0.01, 11.1) +{"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}}, VF (__v_dummyf, 1.0, 2.0) VF (__v_expf, -9.9, 9.9) VF (__v_expf_1u, -9.9, 9.9) @@ -285,6 +311,8 @@ VND (__vn_exp, -9.9, 9.9) VND (_ZGVnN2v_exp, -9.9, 9.9) VND (__vn_log, 0.01, 11.1) VND (_ZGVnN2v_log, 0.01, 11.1) +{"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}}, +{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}}, VND (__vn_sin, -3.1, 3.1) VND (_ZGVnN2v_sin, -3.1, 3.1) VND (__vn_cos, -3.1, 3.1) diff --git a/math/test/runulp.sh b/math/test/runulp.sh index 44393b8..a8c391b 100755 --- a/math/test/runulp.sh +++ b/math/test/runulp.sh @@ -45,6 +45,16 @@ t exp2 -0x1p-6 -0x1p6 40000 t exp2 633.3 733.3 10000 t exp2 -633.3 -777.3 10000 +L=0.02 +t log 0 0xffff000000000000 10000 +t log 0x1p-4 0x1p4 40000 +t log 0 inf 40000 + +L=0.05 +t log2 0 0xffff000000000000 10000 +t log2 0x1p-4 0x1p4 40000 +t log2 0 inf 40000 + L=0.05 t pow 0.5 2.0 x 0 inf 20000 t pow -0.5 -2.0 x 0 inf 20000 @@ -72,6 +82,16 @@ t exp2f 0 0xffff0000 10000 t exp2f 0x1p-14 0x1p8 50000 t exp2f -0x1p-14 -0x1p8 50000 +L=0.32 +t logf 0 0xffff0000 10000 +t logf 0x1p-4 0x1p4 50000 +t logf 0 inf 50000 + +L=0.26 +t log2f 0 0xffff0000 10000 +t log2f 0x1p-4 0x1p4 50000 +t log2f 0 inf 50000 + L=0.06 t sinf 0 0xffff0000 10000 t sinf 0x1p-14 0x1p54 50000 @@ -82,6 +102,16 @@ t cosf 0 0xffff0000 10000 t cosf 0x1p-14 0x1p54 50000 t cosf -0x1p-14 -0x1p54 50000 +L=0.06 +t sincosf_sinf 0 0xffff0000 10000 +t sincosf_sinf 0x1p-14 0x1p54 50000 +t sincosf_sinf -0x1p-14 -0x1p54 50000 + +L=0.06 +t sincosf_cosf 0 0xffff0000 10000 +t sincosf_cosf 0x1p-14 0x1p54 50000 +t sincosf_cosf -0x1p-14 -0x1p54 50000 + L=0.4 t powf 0x1p-1 0x1p1 x 0x1p-7 0x1p7 50000 t powf 0x1p-1 0x1p1 x -0x1p-7 -0x1p7 50000 @@ -110,6 +140,28 @@ range_exp=' -633.3 -777.3 10000 ' +range_log=' + 0 0xffff000000000000 10000 + 0x1p-4 0x1p4 400000 + 0 inf 400000 +' + +range_pow=' + 0x1p-1 0x1p1 x 0x1p-10 0x1p10 50000 + 0x1p-1 0x1p1 x -0x1p-10 -0x1p10 50000 + 0x1p-500 0x1p500 x 0x1p-1 0x1p1 50000 + 0x1p-500 0x1p500 x -0x1p-1 -0x1p1 50000 + 0x1.ep-1 0x1.1p0 x 0x1p8 0x1p16 50000 + 0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p16 50000 +' + +range_sin=' + 0 0xffff000000000000 10000 + 0x1p-4 0x1p4 400000 + -0x1p-23 0x1p23 400000 +' +range_cos="$range_sin" + range_expf=' 0 0xffff0000 10000 0x1p-14 0x1p8 500000 @@ -143,6 +195,10 @@ range_powf=' # error limits L_exp=1.9 +L_log=1.2 +L_pow=0.05 
+L_sin=3.0 +L_cos=3.0 L_expf=1.49 L_expf_1u=0.4 L_exp2f=1.49 @@ -173,6 +229,26 @@ exp __v_exp $runv exp __vn_exp $runvn exp _ZGVnN2v_exp $runvn +log __s_log $runs +log __v_log $runv +log __vn_log $runvn +log _ZGVnN2v_log $runvn + +pow __s_pow $runs +pow __v_pow $runv +pow __vn_pow $runvn +pow _ZGVnN2vv_pow $runvn + +sin __s_sin $runs +sin __v_sin $runv +sin __vn_sin $runvn +sin _ZGVnN2v_sin $runvn + +cos __s_cos $runs +cos __v_cos $runv +cos __vn_cos $runvn +cos _ZGVnN2v_cos $runvn + expf __s_expf $runs expf __v_expf $runv expf __vn_expf $runvn diff --git a/math/test/ulp.c b/math/test/ulp.c index b746080..371567a 100644 --- a/math/test/ulp.c +++ b/math/test/ulp.c @@ -214,6 +214,16 @@ struct conf double errlim; }; +/* Wrappers for sincos. */ +static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);} +static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);} +static double sincos_sin(double x) {(void)cos(x); return sin(x);} +static double sincos_cos(double x) {(void)sin(x); return cos(x);} +#if USE_MPFR +static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); } +static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); } +#endif + /* A bit of a hack: call vector functions twice with the same input in lane 0 but a different value in other lanes: once with an in-range value and then with a special case value. */ @@ -240,6 +250,7 @@ static double v_sin(double x) { return __v_sin(argd(x))[0]; } static double v_cos(double x) { return __v_cos(argd(x))[0]; } static double v_exp(double x) { return __v_exp(argd(x))[0]; } static double v_log(double x) { return __v_log(argd(x))[0]; } +static double v_pow(double x, double y) { return __v_pow(argd(x),argd(y))[0]; } #ifdef __vpcs static float vn_sinf(float x) { return __vn_sinf(argf(x))[0]; } static float vn_cosf(float x) { return __vn_cosf(argf(x))[0]; } @@ -253,6 +264,7 @@ static double vn_sin(double x) { return __vn_sin(argd(x))[0]; } static double vn_cos(double x) { return __vn_cos(argd(x))[0]; } static double vn_exp(double x) { return __vn_exp(argd(x))[0]; } static double vn_log(double x) { return __vn_log(argd(x))[0]; } +static double vn_pow(double x, double y) { return __vn_pow(argd(x),argd(y))[0]; } static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; } static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; } static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; } @@ -263,6 +275,7 @@ static double Z_sin(double x) { return _ZGVnN2v_sin(argd(x))[0]; } static double Z_cos(double x) { return _ZGVnN2v_cos(argd(x))[0]; } static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; } static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; } +static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; } #endif #endif @@ -311,6 +324,8 @@ static const struct fun fun[] = { #define D2(x) F (x, x, x##l, mpfr_##x, 2, 0, d2, 0) F1 (sin) F1 (cos) + F (sincosf_sinf, sincosf_sinf, sincos_sin, sincos_mpfr_sin, 1, 1, f1, 0) + F (sincosf_cosf, sincosf_cosf, sincos_cos, sincos_mpfr_cos, 1, 1, f1, 0) F1 (exp) F1 (exp2) F1 (log) @@ -334,6 +349,7 @@ static const struct fun fun[] = { F (__s_cos, __s_cos, cosl, mpfr_cos, 1, 0, d1, 0) F (__s_exp, __s_exp, expl, mpfr_exp, 1, 0, d1, 0) F (__s_log, __s_log, logl, mpfr_log, 1, 0, d1, 0) + F (__s_pow, __s_pow, powl, mpfr_pow, 2, 0, d2, 0) #if __aarch64__ F (__v_sinf, v_sinf, sin, mpfr_sin, 1, 1, f1, 1) F (__v_cosf, v_cosf, cos, mpfr_cos, 
1, 1, f1, 1) @@ -347,6 +363,7 @@ static const struct fun fun[] = { F (__v_cos, v_cos, cosl, mpfr_cos, 1, 0, d1, 1) F (__v_exp, v_exp, expl, mpfr_exp, 1, 0, d1, 1) F (__v_log, v_log, logl, mpfr_log, 1, 0, d1, 1) + F (__v_pow, v_pow, powl, mpfr_pow, 2, 0, d2, 1) #ifdef __vpcs F (__vn_sinf, vn_sinf, sin, mpfr_sin, 1, 1, f1, 1) F (__vn_cosf, vn_cosf, cos, mpfr_cos, 1, 1, f1, 1) @@ -360,6 +377,7 @@ static const struct fun fun[] = { F (__vn_cos, vn_cos, cosl, mpfr_cos, 1, 0, d1, 1) F (__vn_exp, vn_exp, expl, mpfr_exp, 1, 0, d1, 1) F (__vn_log, vn_log, logl, mpfr_log, 1, 0, d1, 1) + F (__vn_pow, vn_pow, powl, mpfr_pow, 2, 0, d2, 1) F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1) F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1) F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1) @@ -370,6 +388,7 @@ static const struct fun fun[] = { F (_ZGVnN2v_cos, Z_cos, cosl, mpfr_cos, 1, 0, d1, 1) F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1) F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1) + F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1) #endif #endif #endif diff --git a/math/v_math.h b/math/v_math.h index 0861e98..3db22e5 100644 --- a/math/v_math.h +++ b/math/v_math.h @@ -249,6 +249,11 @@ v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, return f (x1, x2); } +static inline int +v_lanes64 (void) +{ + return 1; +} static inline v_f64_t v_f64 (f64_t x) { @@ -264,6 +269,16 @@ v_s64 (s64_t x) { return x; } +static inline f64_t +v_get_f64 (v_f64_t x, int i) +{ + return x; +} +static inline void +v_set_f64 (v_f64_t *x, int i, f64_t v) +{ + *x = v; +} /* true if any elements of a v_cond result is non-zero. */ static inline int v_any_u64 (v_u64_t x) @@ -506,6 +521,11 @@ v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]}; } +static inline int +v_lanes64 (void) +{ + return 2; +} static inline v_f64_t v_f64 (f64_t x) { @@ -521,6 +541,16 @@ v_s64 (s64_t x) { return (v_s64_t){x, x}; } +static inline f64_t +v_get_f64 (v_f64_t x, int i) +{ + return x[i]; +} +static inline void +v_set_f64 (v_f64_t *x, int i, f64_t v) +{ + (*x)[i] = v; +} /* true if any elements of a v_cond result is non-zero. */ static inline int v_any_u64 (v_u64_t x) diff --git a/math/v_pow.c b/math/v_pow.c new file mode 100644 index 0000000..a209d57 --- /dev/null +++ b/math/v_pow.c @@ -0,0 +1,27 @@ +/* + * Double-precision vector pow function. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +VPCS_ATTR +v_f64_t +V_NAME(pow) (v_f64_t x, v_f64_t y) +{ + v_f64_t z; + for (int lane = 0; lane < v_lanes64 (); lane++) + { + f64_t sx = v_get_f64 (x, lane); + f64_t sy = v_get_f64 (y, lane); + f64_t sz = pow (sx, sy); + v_set_f64 (&z, lane, sz); + } + return z; +} +VPCS_ALIAS +#endif diff --git a/math/vn_pow.c b/math/vn_pow.c new file mode 100644 index 0000000..2609501 --- /dev/null +++ b/math/vn_pow.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_pow. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_pow, _ZGVnN2vv_pow) +#include "v_pow.c" +#endif diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S index aff6e3d..10be49e 100644 --- a/string/aarch64/memchr.S +++ b/string/aarch64/memchr.S @@ -11,6 +11,8 @@ * Neon Available. */ +#include "../asmdefs.h" + /* Arguments and results. 
*/ #define srcin x0 #define chrin w1 @@ -44,17 +46,9 @@ * identify exactly which byte has matched. */ - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn __memchr_aarch64 +ENTRY (__memchr_aarch64) /* Do not dereference srcin if no bytes to compare. */ - cbz cntin, .Lzero_length + cbz cntin, L(zero_length) /* * Magic constant 0x40100401 allows us to identify which lane matches * the requested byte. @@ -67,7 +61,7 @@ def_fn __memchr_aarch64 dup vrepmask.4s, wtmp2 ands soff, srcin, #31 and cntrem, cntin, #31 - b.eq .Lloop + b.eq L(loop) /* * Input string is not 32-byte aligned. We calculate the syndrome @@ -90,25 +84,25 @@ def_fn __memchr_aarch64 lsr synd, synd, tmp lsl synd, synd, tmp /* The first block can also be the last */ - b.ls .Lmasklast + b.ls L(masklast) /* Have we found something already? */ - cbnz synd, .Ltail + cbnz synd, L(tail) -.Lloop: +L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 subs cntin, cntin, #32 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b /* If we're out of data we finish regardless of the result */ - b.ls .Lend + b.ls L(end) /* Use a fast check for the termination condition */ orr vend.16b, vhas_chr1.16b, vhas_chr2.16b addp vend.2d, vend.2d, vend.2d mov synd, vend.d[0] /* We're not out of data, loop if we haven't found the character */ - cbz synd, .Lloop + cbz synd, L(loop) -.Lend: +L(end): /* Termination condition found, let's calculate the syndrome value */ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b @@ -116,9 +110,9 @@ def_fn __memchr_aarch64 addp vend.16b, vend.16b, vend.16b /* 128->64 */ mov synd, vend.d[0] /* Only do the clear for the last possible block */ - b.hi .Ltail + b.hi L(tail) -.Lmasklast: +L(masklast): /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */ add tmp, cntrem, soff and tmp, tmp, #31 @@ -127,7 +121,7 @@ def_fn __memchr_aarch64 lsl synd, synd, tmp lsr synd, synd, tmp -.Ltail: +L(tail): /* Count the trailing zeros using bit reversing */ rbit synd, synd /* Compensate the last post-increment */ @@ -142,8 +136,8 @@ def_fn __memchr_aarch64 csel result, xzr, result, eq ret -.Lzero_length: +L(zero_length): mov result, #0 ret - .size __memchr_aarch64, . - __memchr_aarch64 +END (__memchr_aarch64) diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S index 72a66bc..6722516 100644 --- a/string/aarch64/memcmp.S +++ b/string/aarch64/memcmp.S @@ -9,7 +9,7 @@ * ARMv8-a, AArch64, unaligned accesses. */ -#define L(l) .L ## l +#include "../asmdefs.h" /* Parameters and result. */ #define src1 x0 @@ -27,15 +27,7 @@ #define tmp1 x7 #define tmp2 x8 - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn __memcmp_aarch64 p2align=6 +ENTRY (__memcmp_aarch64) subs limit, limit, 8 b.lo L(less8) @@ -138,4 +130,4 @@ L(byte_loop): sub result, data1w, data2w ret - .size __memcmp_aarch64, . 
- __memcmp_aarch64 +END (__memcmp_aarch64) diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S index aef22e9..3868141 100644 --- a/string/aarch64/memset.S +++ b/string/aarch64/memset.S @@ -11,6 +11,7 @@ * */ +#include "../asmdefs.h" #define dstin x0 #define val x1 @@ -25,17 +26,7 @@ #define zva_len x7 #define zva_lenw w7 -#define L(l) .L ## l - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn __memset_aarch64 p2align=6 +ENTRY (__memset_aarch64) dup v0.16B, valw add dstend, dstin, count @@ -185,4 +176,4 @@ L(zva_other): 4: add count, count, zva_len b L(tail64) - .size __memset_aarch64, . - __memset_aarch64 +END (__memset_aarch64) diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S index cdb38aa..00d9be3 100644 --- a/string/aarch64/strchr.S +++ b/string/aarch64/strchr.S @@ -11,6 +11,8 @@ * Neon Available. */ +#include "../asmdefs.h" + /* Arguments and results. */ #define srcin x0 #define chrin w1 @@ -48,15 +50,7 @@ /* Locals and temporaries. */ - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn __strchr_aarch64 +ENTRY (__strchr_aarch64) /* Magic constant 0x40100401 to allow us to identify which lane matches the requested byte. Magic constant 0x80200802 used similarly for NUL termination. */ @@ -67,7 +61,7 @@ def_fn __strchr_aarch64 dup vrepmask_c.4s, wtmp2 ands tmp1, srcin, #31 add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ - b.eq .Lloop + b.eq L(loop) /* Input string is not 32-byte aligned. Rather than forcing the padding bytes to a safe value, we calculate the syndrome @@ -93,9 +87,9 @@ def_fn __strchr_aarch64 mov tmp3, vend1.d[0] bic tmp1, tmp3, tmp1 // Mask padding bits. - cbnz tmp1, .Ltail + cbnz tmp1, L(tail) -.Lloop: +L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 cmeq vhas_nul1.16b, vdata1.16b, #0 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b @@ -107,7 +101,7 @@ def_fn __strchr_aarch64 orr vend1.16b, vend1.16b, vend2.16b addp vend1.2d, vend1.2d, vend1.2d mov tmp1, vend1.d[0] - cbz tmp1, .Lloop + cbz tmp1, L(loop) /* Termination condition found. Now need to establish exactly why we terminated. */ @@ -121,7 +115,7 @@ def_fn __strchr_aarch64 addp vend1.16b, vend1.16b, vend2.16b // 128->64 mov tmp1, vend1.d[0] -.Ltail: +L(tail): /* Count the trailing zeros, by bit reversing... */ rbit tmp1, tmp1 /* Re-bias source. */ @@ -134,4 +128,4 @@ def_fn __strchr_aarch64 csel result, result, xzr, eq ret - .size __strchr_aarch64, . - __strchr_aarch64 +END (__strchr_aarch64) diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S index 4aee293..81264ea 100644 --- a/string/aarch64/strchrnul.S +++ b/string/aarch64/strchrnul.S @@ -11,6 +11,8 @@ * Neon Available. */ +#include "../asmdefs.h" + /* Arguments and results. */ #define srcin x0 #define chrin w1 @@ -44,15 +46,7 @@ /* Locals and temporaries. */ - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn __strchrnul_aarch64 +ENTRY (__strchrnul_aarch64) /* Magic constant 0x40100401 to allow us to identify which lane matches the termination condition. */ mov wtmp2, #0x0401 @@ -61,7 +55,7 @@ def_fn __strchrnul_aarch64 bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ dup vrepmask.4s, wtmp2 ands tmp1, srcin, #31 - b.eq .Lloop + b.eq L(loop) /* Input string is not 32-byte aligned. 
Rather than forcing the padding bytes to a safe value, we calculate the syndrome @@ -85,9 +79,9 @@ def_fn __strchrnul_aarch64 mov tmp3, vend1.d[0] bic tmp1, tmp3, tmp1 // Mask padding bits. - cbnz tmp1, .Ltail + cbnz tmp1, L(tail) -.Lloop: +L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 cmeq vhas_nul1.16b, vdata1.16b, #0 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b @@ -99,7 +93,7 @@ def_fn __strchrnul_aarch64 orr vend1.16b, vhas_chr1.16b, vhas_chr2.16b addp vend1.2d, vend1.2d, vend1.2d mov tmp1, vend1.d[0] - cbz tmp1, .Lloop + cbz tmp1, L(loop) /* Termination condition found. Now need to establish exactly why we terminated. */ @@ -109,7 +103,7 @@ def_fn __strchrnul_aarch64 addp vend1.16b, vend1.16b, vend1.16b // 128->64 mov tmp1, vend1.d[0] -.Ltail: +L(tail): /* Count the trailing zeros, by bit reversing... */ rbit tmp1, tmp1 /* Re-bias source. */ @@ -119,4 +113,4 @@ def_fn __strchrnul_aarch64 add result, src, tmp1, lsr #1 ret - .size __strchrnul_aarch64, . - __strchrnul_aarch64 +END (__strchrnul_aarch64) diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S index 2aa367c..65af5ce 100644 --- a/string/aarch64/strcmp.S +++ b/string/aarch64/strcmp.S @@ -10,15 +10,7 @@ * ARMv8-a, AArch64 */ - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -#define L(label) .L ## label +#include "../asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f @@ -44,7 +36,7 @@ #define pos x11 /* Start of performance-critical section -- one 64B cache line. */ -def_fn __strcmp_aarch64 p2align=6 +ENTRY (__strcmp_aarch64) eor tmp1, src1, src2 mov zeroones, #REP8_01 tst tmp1, #7 @@ -174,4 +166,5 @@ L(loop_misaligned): L(done): sub result, data1, data2 ret - .size __strcmp_aarch64, .-__strcmp_aarch64 + +END (__strcmp_aarch64) diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S index 4e10b4d..4edffcf 100644 --- a/string/aarch64/strcpy.S +++ b/string/aarch64/strcpy.S @@ -10,6 +10,8 @@ * ARMv8-a, AArch64, unaligned accesses, min page size 4k. */ +#include "../asmdefs.h" + /* To build as stpcpy, define BUILD_STPCPY before compiling this file. To test the page crossing code path more thoroughly, compile with @@ -46,14 +48,6 @@ #define STRCPY __strcpy_aarch64 #endif - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and can be done in parallel across the entire word. */ @@ -85,7 +79,7 @@ #define MIN_PAGE_SIZE (1 << MIN_PAGE_P2) -def_fn STRCPY p2align=6 +ENTRY (STRCPY) /* For moderately short strings, the fastest way to do the copy is to calculate the length of the string in the same way as strlen, then essentially do a memcpy of the result. This avoids the need for @@ -105,9 +99,9 @@ def_fn STRCPY p2align=6 srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte aligned string will never fail the page align check, so will always take the fast path. 
*/ - b.gt .Lpage_cross + b.gt L(page_cross) -.Lpage_cross_ok: +L(page_cross_ok): ldp data1, data2, [srcin] #ifdef __AARCH64EB__ /* Because we expect the end to be found within 16 characters @@ -119,7 +113,7 @@ def_fn STRCPY p2align=6 sub tmp1, tmp2, zeroones orr tmp2, tmp2, #REP8_7f bics has_nul1, tmp1, tmp2 - b.ne .Lfp_le8 + b.ne L(fp_le8) rev tmp4, data2 sub tmp3, tmp4, zeroones orr tmp4, tmp4, #REP8_7f @@ -127,17 +121,17 @@ def_fn STRCPY p2align=6 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f bics has_nul1, tmp1, tmp2 - b.ne .Lfp_le8 + b.ne L(fp_le8) sub tmp3, data2, zeroones orr tmp4, data2, #REP8_7f #endif bics has_nul2, tmp3, tmp4 - b.eq .Lbulk_entry + b.eq L(bulk_entry) /* The string is short (<=16 bytes). We don't know exactly how short though, yet. Work out the exact length so that we can quickly select the optimal copy strategy. */ -.Lfp_gt8: +L(fp_gt8): rev has_nul2, has_nul2 clz pos, has_nul2 mov tmp2, #56 @@ -155,12 +149,12 @@ def_fn STRCPY p2align=6 #endif ret -.Lfp_le8: +L(fp_le8): rev has_nul1, has_nul1 clz pos, has_nul1 add dst, dstin, pos, lsr #3 /* Bits to bytes. */ subs tmp2, pos, #24 /* Pos in bits. */ - b.lt .Lfp_lt4 + b.lt L(fp_lt4) #ifdef __AARCH64EB__ mov tmp2, #56 sub pos, tmp2, pos @@ -176,15 +170,15 @@ def_fn STRCPY p2align=6 mov dstin, dst #endif ret -.Lfp_lt4: - cbz pos, .Lfp_lt2 +L(fp_lt4): + cbz pos, L(fp_lt2) /* 2->3 bytes to copy. */ #ifdef __AARCH64EB__ lsr data1, data1, #48 #endif strh data1w, [dstin] /* Fall-through, one byte (max) to go. */ -.Lfp_lt2: +L(fp_lt2): /* Null-terminated string. Last character must be zero! */ strb wzr, [dst] #ifdef BUILD_STPCPY @@ -195,20 +189,20 @@ def_fn STRCPY p2align=6 .p2align 6 /* Aligning here ensures that the entry code and main loop all lies within one 64-byte cache line. */ -.Lbulk_entry: +L(bulk_entry): sub to_align, to_align, #16 stp data1, data2, [dstin] sub src, srcin, to_align sub dst, dstin, to_align - b .Lentry_no_page_cross + b L(entry_no_page_cross) /* The inner loop deals with two Dwords at a time. This has a slightly higher start-up cost, but we should win quite quickly, especially on cores with a high number of issue slots per cycle, as we get much better parallelism out of the operations. */ -.Lmain_loop: +L(main_loop): stp data1, data2, [dst], #16 -.Lentry_no_page_cross: +L(entry_no_page_cross): ldp data1, data2, [src], #16 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f @@ -217,7 +211,7 @@ def_fn STRCPY p2align=6 bic has_nul1, tmp1, tmp2 bics has_nul2, tmp3, tmp4 ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq .Lmain_loop + b.eq L(main_loop) /* Since we know we are copying at least 16 bytes, the fastest way to deal with the tail is to determine the location of the @@ -250,7 +244,7 @@ def_fn STRCPY p2align=6 #endif ret -.Lpage_cross: +L(page_cross): bic src, srcin, #15 /* Start by loading two words at [srcin & ~15], then forcing the bytes that precede srcin to 0xff. This means they never look @@ -276,7 +270,7 @@ def_fn STRCPY p2align=6 bic has_nul1, tmp1, tmp2 bics has_nul2, tmp3, tmp4 ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq .Lpage_cross_ok + b.eq L(page_cross_ok) /* We now need to make data1 and data2 look like they've been loaded directly from srcin. Do a rotate on the 128-bit value. */ lsl tmp1, to_align, #3 /* Bytes->bits. */ @@ -307,8 +301,8 @@ def_fn STRCPY p2align=6 orr tmp4, data2, #REP8_7f #endif bic has_nul1, tmp1, tmp2 - cbnz has_nul1, .Lfp_le8 + cbnz has_nul1, L(fp_le8) bic has_nul2, tmp3, tmp4 - b .Lfp_gt8 + b L(fp_gt8) - .size STRCPY, . 
- STRCPY +END (STRCPY) diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S index 26388d7..2293f73 100644 --- a/string/aarch64/strlen.S +++ b/string/aarch64/strlen.S @@ -10,6 +10,8 @@ * ARMv8-a, AArch64, unaligned accesses, min page size 4k. */ +#include "../asmdefs.h" + /* To test the page crossing code path more thoroughly, compile with -DTEST_PAGE_CROSS - this will force all calls through the slower entry path. This option is not intended for production use. */ @@ -30,16 +32,6 @@ #define tmp4 x7 #define zeroones x8 -#define L(l) .L ## l - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and can be done in parallel across the entire word. A faster check @@ -81,7 +73,7 @@ whether the first fetch, which may be misaligned, crosses a page boundary. */ -def_fn __strlen_aarch64 p2align=6 +ENTRY (__strlen_aarch64) and tmp1, srcin, MIN_PAGE_SIZE - 1 mov zeroones, REP8_01 cmp tmp1, MIN_PAGE_SIZE - 16 @@ -122,7 +114,7 @@ L(main_loop_entry): sub src, src, 16 L(main_loop): ldp data1, data2, [src, 32]! -.Lpage_cross_entry: +L(page_cross_entry): sub tmp1, data1, zeroones sub tmp3, data2, zeroones orr tmp2, tmp1, tmp3 @@ -211,4 +203,4 @@ L(page_cross): csel data2, data2, tmp2, eq b L(page_cross_entry) - .size __strlen_aarch64, . - __strlen_aarch64 +END (__strlen_aarch64) diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S index ced72b9..fbd08ee 100644 --- a/string/aarch64/strncmp.S +++ b/string/aarch64/strncmp.S @@ -10,13 +10,7 @@ * ARMv8-a, AArch64 */ - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm +#include "../asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f @@ -51,14 +45,14 @@ .rep 7 nop /* Pad so that the loop below fits a cache line. */ .endr -def_fn __strncmp_aarch64 - cbz limit, .Lret0 +ENTRY_ALIGN (__strncmp_aarch64, 0) + cbz limit, L(ret0) eor tmp1, src1, src2 mov zeroones, #REP8_01 tst tmp1, #7 and count, src1, #7 - b.ne .Lmisaligned8 - cbnz count, .Lmutual_align + b.ne L(misaligned8) + cbnz count, L(mutual_align) /* Calculate the number of full and partial words -1. */ sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ @@ -67,10 +61,10 @@ def_fn __strncmp_aarch64 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and can be done in parallel across the entire word. */ /* Start of performance-critical section -- one 64B cache line. */ -.Lloop_aligned: +L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 -.Lstart_realigned: +L(start_realigned): subs limit_wd, limit_wd, #1 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f @@ -78,15 +72,15 @@ def_fn __strncmp_aarch64 csinv endloop, diff, xzr, pl /* Last Dword or differences. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp endloop, #0, #0, eq - b.eq .Lloop_aligned + b.eq L(loop_aligned) /* End of performance-critical section -- one 64B cache line. */ /* Not reached the limit, must have found the end or a diff. */ - tbz limit_wd, #63, .Lnot_limit + tbz limit_wd, #63, L(not_limit) /* Limit % 8 == 0 => all bytes significant. */ ands limit, limit, #7 - b.eq .Lnot_limit + b.eq L(not_limit) lsl limit, limit, #3 /* Bits -> bytes. */ mov mask, #~0 @@ -101,7 +95,7 @@ def_fn __strncmp_aarch64 /* Make sure that the NUL byte is marked in the syndrome. 
*/ orr has_nul, has_nul, mask -.Lnot_limit: +L(not_limit): orr syndrome, diff, has_nul #ifndef __AARCH64EB__ @@ -154,7 +148,7 @@ def_fn __strncmp_aarch64 ret #endif -.Lmutual_align: +L(mutual_align): /* Sources are mutually aligned, but are not currently at an alignment boundary. Round down the addresses and then mask off the bytes that precede the start point. @@ -182,56 +176,56 @@ def_fn __strncmp_aarch64 orr data1, data1, tmp2 orr data2, data2, tmp2 add limit_wd, limit_wd, tmp3, lsr #3 - b .Lstart_realigned + b L(start_realigned) .p2align 6 /* Don't bother with dwords for up to 16 bytes. */ -.Lmisaligned8: +L(misaligned8): cmp limit, #16 - b.hs .Ltry_misaligned_words + b.hs L(try_misaligned_words) -.Lbyte_loop: +L(byte_loop): /* Perhaps we can do better than this. */ ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 subs limit, limit, #1 ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq .Lbyte_loop -.Ldone: + b.eq L(byte_loop) +L(done): sub result, data1, data2 ret /* Align the SRC1 to a dword by doing a bytewise compare and then do the dword loop. */ -.Ltry_misaligned_words: +L(try_misaligned_words): lsr limit_wd, limit, #3 - cbz count, .Ldo_misaligned + cbz count, L(do_misaligned) neg count, count and count, count, #7 sub limit, limit, count lsr limit_wd, limit, #3 -.Lpage_end_loop: +L(page_end_loop): ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 cmp data1w, #1 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.ne .Ldone + b.ne L(done) subs count, count, #1 - b.hi .Lpage_end_loop + b.hi L(page_end_loop) -.Ldo_misaligned: +L(do_misaligned): /* Prepare ourselves for the next page crossing. Unlike the aligned loop, we fetch 1 less dword because we risk crossing bounds on SRC2. */ mov count, #8 subs limit_wd, limit_wd, #1 - b.lo .Ldone_loop -.Lloop_misaligned: + b.lo L(done_loop) +L(loop_misaligned): and tmp2, src2, #0xff8 eor tmp2, tmp2, #0xff8 - cbz tmp2, .Lpage_end_loop + cbz tmp2, L(page_end_loop) ldr data1, [src1], #8 ldr data2, [src2], #8 @@ -240,14 +234,14 @@ def_fn __strncmp_aarch64 eor diff, data1, data2 /* Non-zero if differences found. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp diff, #0, #0, eq - b.ne .Lnot_limit + b.ne L(not_limit) subs limit_wd, limit_wd, #1 - b.pl .Lloop_misaligned + b.pl L(loop_misaligned) -.Ldone_loop: +L(done_loop): /* We found a difference or a NULL before the limit was reached. */ and limit, limit, #7 - cbz limit, .Lnot_limit + cbz limit, L(not_limit) /* Read the last word. */ sub src1, src1, 8 sub src2, src2, 8 @@ -258,9 +252,10 @@ def_fn __strncmp_aarch64 eor diff, data1, data2 /* Non-zero if differences found. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp diff, #0, #0, eq - b.ne .Lnot_limit + b.ne L(not_limit) -.Lret0: +L(ret0): mov result, #0 ret - .size __strncmp_aarch64, . - __strncmp_aarch64 + +END ( __strncmp_aarch64) diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S index b02c846..df66b60 100644 --- a/string/aarch64/strnlen.S +++ b/string/aarch64/strnlen.S @@ -10,6 +10,8 @@ * ARMv8-a, AArch64 */ +#include "../asmdefs.h" + /* Arguments and results. 
*/ #define srcin x0 #define len x0 @@ -30,36 +32,28 @@ #define pos x13 #define limit_wd x14 - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f #define REP8_80 0x8080808080808080 .text .p2align 6 -.Lstart: +L(start): /* Pre-pad to ensure critical loop begins an icache line. */ .rep 7 nop .endr /* Put this code here to avoid wasting more space with pre-padding. */ -.Lhit_limit: +L(hit_limit): mov len, limit ret -def_fn __strnlen_aarch64 - cbz limit, .Lhit_limit +ENTRY_ALIGN (__strnlen_aarch64, 0) + cbz limit, L(hit_limit) mov zeroones, #REP8_01 bic src, srcin, #15 ands tmp1, srcin, #15 - b.ne .Lmisaligned + b.ne L(misaligned) /* Calculate the number of full and partial words -1. */ sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */ lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */ @@ -73,9 +67,9 @@ def_fn __strnlen_aarch64 cycle, as we get much better parallelism out of the operations. */ /* Start of critial section -- keep to one 64Byte cache line. */ -.Lloop: +L(loop): ldp data1, data2, [src], #16 -.Lrealigned: +L(realigned): sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f sub tmp3, data2, zeroones @@ -85,24 +79,24 @@ def_fn __strnlen_aarch64 subs limit_wd, limit_wd, #1 orr tmp1, has_nul1, has_nul2 ccmp tmp1, #0, #0, pl /* NZCV = 0000 */ - b.eq .Lloop + b.eq L(loop) /* End of critical section -- keep to one 64Byte cache line. */ orr tmp1, has_nul1, has_nul2 - cbz tmp1, .Lhit_limit /* No null in final Qword. */ + cbz tmp1, L(hit_limit) /* No null in final Qword. */ /* We know there's a null in the final Qword. The easiest thing to do now is work out the length of the string and return MIN (len, limit). */ sub len, src, srcin - cbz has_nul1, .Lnul_in_data2 + cbz has_nul1, L(nul_in_data2) #ifdef __AARCH64EB__ mov data2, data1 #endif sub len, len, #8 mov has_nul2, has_nul1 -.Lnul_in_data2: +L(nul_in_data2): #ifdef __AARCH64EB__ /* For big-endian, carry propagation (if the final byte in the string is 0x01) means we cannot use has_nul directly. The @@ -121,7 +115,7 @@ def_fn __strnlen_aarch64 csel len, len, limit, ls /* Return the lower value. */ ret -.Lmisaligned: +L(misaligned): /* Deal with a partial first word. We're doing two things in parallel here; 1) Calculate the number of words (but avoiding overflow if @@ -156,5 +150,6 @@ def_fn __strnlen_aarch64 csinv data1, data1, xzr, le csel data2, data2, data2a, le - b .Lrealigned - .size __strnlen_aarch64, . - .Lstart /* Include pre-padding in size. */ + b L(realigned) + +END (__strnlen_aarch64) diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S new file mode 100644 index 0000000..1b4caac --- /dev/null +++ b/string/aarch64/strrchr.S @@ -0,0 +1,147 @@ +/* + * strrchr - find last position of a character in a string. + * + * Copyright (c) 2014-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64 + * Neon Available. + */ + +#include "../asmdefs.h" + +/* Arguments and results. 
*/ +#define srcin x0 +#define chrin w1 + +#define result x0 + +#define src x2 +#define tmp1 x3 +#define wtmp2 w4 +#define tmp3 x5 +#define src_match x6 +#define src_offset x7 +#define const_m1 x8 +#define tmp4 x9 +#define nul_match x10 +#define chr_match x11 + +#define vrepchr v0 +#define vdata1 v1 +#define vdata2 v2 +#define vhas_nul1 v3 +#define vhas_nul2 v4 +#define vhas_chr1 v5 +#define vhas_chr2 v6 +#define vrepmask_0 v7 +#define vrepmask_c v16 +#define vend1 v17 +#define vend2 v18 + +/* Core algorithm. + + For each 32-byte hunk we calculate a 64-bit syndrome value, with + two bits per byte (LSB is always in bits 0 and 1, for both big + and little-endian systems). For each tuple, bit 0 is set iff + the relevant byte matched the requested character; bit 1 is set + iff the relevant byte matched the NUL end of string (we trigger + off bit0 for the special case of looking for NUL). Since the bits + in the syndrome reflect exactly the order in which things occur + in the original string a count_trailing_zeros() operation will + identify exactly which byte is causing the termination, and why. */ + +ENTRY (__strrchr_aarch64) + /* Magic constant 0x40100401 to allow us to identify which lane + matches the requested byte. Magic constant 0x80200802 used + similarly for NUL termination. */ + mov wtmp2, #0x0401 + movk wtmp2, #0x4010, lsl #16 + dup vrepchr.16b, chrin + bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ + dup vrepmask_c.4s, wtmp2 + mov src_offset, #0 + ands tmp1, srcin, #31 + add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ + b.eq L(aligned) + + /* Input string is not 32-byte aligned. Rather than forcing + the padding bytes to a safe value, we calculate the syndrome + for all the bytes, but then mask off those bits of the + syndrome that are related to the padding. */ + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + neg tmp1, tmp1 + cmeq vhas_nul1.16b, vdata1.16b, #0 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_nul2.16b, vdata2.16b, #0 + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b + and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b + and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b + addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128 + addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 + addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b // 128->64 + addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64 + mov nul_match, vhas_nul1.d[0] + lsl tmp1, tmp1, #1 + mov const_m1, #~0 + mov chr_match, vhas_chr1.d[0] + lsr tmp3, const_m1, tmp1 + + bic nul_match, nul_match, tmp3 // Mask padding bits. + bic chr_match, chr_match, tmp3 // Mask padding bits. 
+ cbnz nul_match, L(tail) + +L(loop): + cmp chr_match, #0 + csel src_match, src, src_match, ne + csel src_offset, chr_match, src_offset, ne +L(aligned): + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + cmeq vhas_nul1.16b, vdata1.16b, #0 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_nul2.16b, vdata2.16b, #0 + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + addp vend1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128 + and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b + addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 + addp vend1.16b, vend1.16b, vend1.16b // 128->64 + addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64 + mov nul_match, vend1.d[0] + mov chr_match, vhas_chr1.d[0] + cbz nul_match, L(loop) + + and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b + and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b + addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b + addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b + mov nul_match, vhas_nul1.d[0] + +L(tail): + /* Work out exactly where the string ends. */ + sub tmp4, nul_match, #1 + eor tmp4, tmp4, nul_match + ands chr_match, chr_match, tmp4 + /* And pick the values corresponding to the last match. */ + csel src_match, src, src_match, ne + csel src_offset, chr_match, src_offset, ne + + /* Count down from the top of the syndrome to find the last match. */ + clz tmp3, src_offset + /* Src_match points beyond the word containing the match, so we can + simply subtract half the bit-offset into the syndrome. Because + we are counting down, we need to go back one more character. */ + add tmp3, tmp3, #2 + sub result, src_match, tmp3, lsr #1 + /* But if the syndrome shows no match was found, then return NULL. */ + cmp src_offset, #0 + csel result, result, xzr, ne + + ret + +END (__strrchr_aarch64) diff --git a/string/arm/memcpy.S b/string/arm/memcpy.S index 3346e4f..aab78a2 100644 --- a/string/arm/memcpy.S +++ b/string/arm/memcpy.S @@ -17,6 +17,8 @@ */ +#include "../asmdefs.h" + .syntax unified /* This implementation requires ARM state. */ .arm @@ -118,23 +120,15 @@ .endm #endif - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn __memcpy_arm p2align=6 +ENTRY (__memcpy_arm) mov dst, dstin /* Preserve dstin, we need to return it. */ cmp count, #64 - bge .Lcpy_not_short + bge L(cpy_not_short) /* Deal with small copies quickly by dropping straight into the exit block. */ -.Ltail63unaligned: +L(tail63unaligned): #ifdef USE_NEON and tmp1, count, #0x38 rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) @@ -213,13 +207,13 @@ def_fn __memcpy_arm p2align=6 strbne src, [dst] bx lr -.Lcpy_not_short: +L(cpy_not_short): /* At least 64 bytes to copy, but don't know the alignment yet. */ str tmp2, [sp, #-FRAME_SIZE]! and tmp2, src, #7 and tmp1, dst, #7 cmp tmp1, tmp2 - bne .Lcpy_notaligned + bne L(cpy_notaligned) #ifdef USE_VFP /* Magic dust alert! Force VFP on Cortex-A9. Experiments show @@ -245,12 +239,12 @@ def_fn __memcpy_arm p2align=6 1: subs tmp2, count, #64 /* Use tmp2 for count. */ - blt .Ltail63aligned + blt L(tail63aligned) cmp tmp2, #512 - bge .Lcpy_body_long + bge L(cpy_body_long) -.Lcpy_body_medium: /* Count in tmp2. */ +L(cpy_body_medium): /* Count in tmp2. */ #ifdef USE_VFP 1: vldr d0, [src, #0] @@ -274,9 +268,9 @@ def_fn __memcpy_arm p2align=6 add dst, dst, #64 bge 1b tst tmp2, #0x3f - beq .Ldone + beq L(done) -.Ltail63aligned: /* Count in tmp2. */ +L(tail63aligned): /* Count in tmp2. 
*/ and tmp1, tmp2, #0x38 add dst, dst, tmp1 add src, src, tmp1 @@ -327,7 +321,7 @@ def_fn __memcpy_arm p2align=6 add src, src, #8 add dst, dst, #8 -.Ltail63aligned: /* Count in tmp2. */ +L(tail63aligned): /* Count in tmp2. */ /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but we know that the src and dest are 64-bit aligned so we can use LDRD/STRD to improve efficiency. */ @@ -364,11 +358,11 @@ def_fn __memcpy_arm p2align=6 strhcs tmp1, [dst], #2 strbne tmp2, [dst] -.Ldone: +L(done): ldr tmp2, [sp], #FRAME_SIZE bx lr -.Lcpy_body_long: /* Count in tmp2. */ +L(cpy_body_long): /* Count in tmp2. */ /* Long copy. We know that there's at least (prefetch_lines * 64) bytes to go. */ @@ -425,7 +419,7 @@ def_fn __memcpy_arm p2align=6 vstr d2, [dst, #64 + 56] add dst, dst, #128 add tmp2, tmp2, #prefetch_lines * 64 - b .Lcpy_body_medium + b L(cpy_body_medium) #else /* Long copy. Use an SMS style loop to maximize the I/O bandwidth of the core. We don't have enough spare registers @@ -479,12 +473,12 @@ def_fn __memcpy_arm p2align=6 ldrd D_l, D_h, [sp, #24] add dst, dst, #72 tst tmp2, #0x3f - bne .Ltail63aligned + bne L(tail63aligned) ldr tmp2, [sp], #FRAME_SIZE bx lr #endif -.Lcpy_notaligned: +L(cpy_notaligned): pld [src] pld [src, #64] /* There's at least 64 bytes to copy, but there is no mutual @@ -506,7 +500,7 @@ def_fn __memcpy_arm p2align=6 pld [src, #(3 * 64)] subs count, count, #64 ldrmi tmp2, [sp], #FRAME_SIZE - bmi .Ltail63unaligned + bmi L(tail63unaligned) pld [src, #(4 * 64)] #ifdef USE_NEON @@ -587,7 +581,7 @@ def_fn __memcpy_arm p2align=6 ands count, tmp2, #0x3f #endif ldr tmp2, [sp], #FRAME_SIZE - bne .Ltail63unaligned + bne L(tail63unaligned) bx lr - .size __memcpy_arm, . - __memcpy_arm +END (__memcpy_arm) diff --git a/string/arm/strcmp-armv6m.S b/string/arm/strcmp-armv6m.S index 5ea06c9..d615231 100644 --- a/string/arm/strcmp-armv6m.S +++ b/string/arm/strcmp-armv6m.S @@ -26,12 +26,7 @@ DoSub \n, \label .endm - .text - .p2align 0 - .global __strcmp_armv6m - .type __strcmp_armv6m, %function -__strcmp_armv6m: - .cfi_startproc +ENTRY_ALIGN (__strcmp_armv6m, 4) mov r2, r0 push {r4, r5, r6, lr} orrs r2, r1 @@ -114,5 +109,5 @@ __strcmp_armv6m: 7: subs r0, r2, r3 pop {r4, r5, r6, pc} - .cfi_endproc - .size __strcmp_armv6m, . - __strcmp_armv6m + +END (__strcmp_armv6m) diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S index fb9cae3..295db8b 100644 --- a/string/arm/strcmp.S +++ b/string/arm/strcmp.S @@ -10,6 +10,8 @@ is sufficiently aligned. Use saturating arithmetic to optimize the compares. */ +#include "../asmdefs.h" + /* Build Options: STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first byte in the string. If comparing completely random strings @@ -48,14 +50,6 @@ #define LSB 0x000000ff #endif /* not __ARM_BIG_ENDIAN */ - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - /* Parameters and result. */ #define src1 r0 #define src2 r1 @@ -131,23 +125,22 @@ .text .p2align 5 -.Lstrcmp_start_addr: +L(strcmp_start_addr): #if STRCMP_NO_PRECHECK == 0 -.Lfastpath_exit: +L(fastpath_exit): sub r0, r2, r3 bx lr nop #endif -def_fn __strcmp_arm +ENTRY_ALIGN (__strcmp_arm, 0) #if STRCMP_NO_PRECHECK == 0 ldrb r2, [src1] ldrb r3, [src2] cmp r2, #1 it cs cmpcs r2, r3 - bne .Lfastpath_exit + bne L(fastpath_exit) #endif - .cfi_startproc strd r4, r5, [sp, #-16]! 
.cfi_def_cfa_offset 16 .cfi_offset 4, -16 @@ -158,12 +151,12 @@ def_fn __strcmp_arm .cfi_offset 7, -4 mvn const_m1, #0 lsl r2, tmp1, #29 - cbz r2, .Lloop_aligned8 + cbz r2, L(loop_aligned8) -.Lnot_aligned: +L(not_aligned): eor tmp1, src1, src2 tst tmp1, #7 - bne .Lmisaligned8 + bne L(misaligned8) /* Deal with mutual misalignment by aligning downwards and then masking off the unwanted loaded data to prevent a difference. */ @@ -180,29 +173,29 @@ def_fn __strcmp_arm S2HI tmp1, const_m1, tmp2 orn data1a, data1a, tmp1 orn data2a, data2a, tmp1 - beq .Lstart_realigned8 + beq L(start_realigned8) orn data1b, data1b, tmp1 mov data1a, const_m1 orn data2b, data2b, tmp1 mov data2a, const_m1 - b .Lstart_realigned8 + b L(start_realigned8) /* Unwind the inner loop by a factor of 2, giving 16 bytes per pass. */ .p2align 5,,12 /* Don't start in the tail bytes of a cache line. */ .p2align 2 /* Always word aligned. */ -.Lloop_aligned8: +L(loop_aligned8): ldrd data1a, data1b, [src1], #16 ldrd data2a, data2b, [src2], #16 -.Lstart_realigned8: +L(start_realigned8): uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ eor syndrome_a, data1a, data2a sel syndrome_a, syndrome_a, const_m1 - cbnz syndrome_a, .Ldiff_in_a + cbnz syndrome_a, L(diff_in_a) uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */ eor syndrome_b, data1b, data2b sel syndrome_b, syndrome_b, const_m1 - cbnz syndrome_b, .Ldiff_in_b + cbnz syndrome_b, L(diff_in_b) ldrd data1a, data1b, [src1, #-8] ldrd data2a, data2b, [src2, #-8] @@ -214,47 +207,47 @@ def_fn __strcmp_arm sel syndrome_b, syndrome_b, const_m1 /* Can't use CBZ for backwards branch. */ orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */ - beq .Lloop_aligned8 + beq L(loop_aligned8) -.Ldiff_found: - cbnz syndrome_a, .Ldiff_in_a +L(diff_found): + cbnz syndrome_a, L(diff_in_a) -.Ldiff_in_b: +L(diff_in_b): strcmp_epilogue_aligned syndrome_b, data1b, data2b 1 -.Ldiff_in_a: +L(diff_in_a): .cfi_restore_state strcmp_epilogue_aligned syndrome_a, data1a, data2a 1 .cfi_restore_state -.Lmisaligned8: +L(misaligned8): tst tmp1, #3 - bne .Lmisaligned4 + bne L(misaligned4) ands tmp1, src1, #3 - bne .Lmutual_align4 + bne L(mutual_align4) /* Unrolled by a factor of 2, to reduce the number of post-increment operations. */ -.Lloop_aligned4: +L(loop_aligned4): ldr data1, [src1], #8 ldr data2, [src2], #8 -.Lstart_realigned4: +L(start_realigned4): uadd8 syndrome, data1, const_m1 /* Only need GE bits. */ eor syndrome, data1, data2 sel syndrome, syndrome, const_m1 - cbnz syndrome, .Laligned4_done + cbnz syndrome, L(aligned4_done) ldr data1, [src1, #-4] ldr data2, [src2, #-4] uadd8 syndrome, data1, const_m1 eor syndrome, data1, data2 sel syndrome, syndrome, const_m1 cmp syndrome, #0 - beq .Lloop_aligned4 + beq L(loop_aligned4) -.Laligned4_done: +L(aligned4_done): strcmp_epilogue_aligned syndrome, data1, data2, 0 -.Lmutual_align4: +L(mutual_align4): .cfi_restore_state /* Deal with mutual misalignment by aligning downwards and then masking off the unwanted loaded data to prevent a difference. 
*/ @@ -269,57 +262,57 @@ def_fn __strcmp_arm S2HI tmp1, const_m1, tmp1 orn data1, data1, tmp1 orn data2, data2, tmp1 - b .Lstart_realigned4 + b L(start_realigned4) -.Lmisaligned4: +L(misaligned4): ands tmp1, src1, #3 - beq .Lsrc1_aligned + beq L(src1_aligned) sub src2, src2, tmp1 bic src1, src1, #3 lsls tmp1, tmp1, #31 ldr data1, [src1], #4 - beq .Laligned_m2 - bcs .Laligned_m1 + beq L(aligned_m2) + bcs L(aligned_m1) #if STRCMP_NO_PRECHECK == 1 ldrb data2, [src2, #1] uxtb tmp1, data1, ror #BYTE1_OFFSET subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbz data2, .Lmisaligned_exit + bne L(misaligned_exit) + cbz data2, L(misaligned_exit) -.Laligned_m2: +L(aligned_m2): ldrb data2, [src2, #2] uxtb tmp1, data1, ror #BYTE2_OFFSET subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbz data2, .Lmisaligned_exit + bne L(misaligned_exit) + cbz data2, L(misaligned_exit) -.Laligned_m1: +L(aligned_m1): ldrb data2, [src2, #3] uxtb tmp1, data1, ror #BYTE3_OFFSET subs tmp1, tmp1, data2 - bne .Lmisaligned_exit + bne L(misaligned_exit) add src2, src2, #4 - cbnz data2, .Lsrc1_aligned + cbnz data2, L(src1_aligned) #else /* STRCMP_NO_PRECHECK */ /* If we've done the pre-check, then we don't need to check the first byte again here. */ ldrb data2, [src2, #2] uxtb tmp1, data1, ror #BYTE2_OFFSET subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbz data2, .Lmisaligned_exit + bne L(misaligned_exit) + cbz data2, L(misaligned_exit) -.Laligned_m2: +L(aligned_m2): ldrb data2, [src2, #3] uxtb tmp1, data1, ror #BYTE3_OFFSET subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbnz data2, .Laligned_m1 + bne L(misaligned_exit) + cbnz data2, L(aligned_m1) #endif -.Lmisaligned_exit: +L(misaligned_exit): .cfi_remember_state mov result, tmp1 ldr r4, [sp], #16 @@ -327,10 +320,10 @@ def_fn __strcmp_arm bx lr #if STRCMP_NO_PRECHECK == 0 -.Laligned_m1: +L(aligned_m1): add src2, src2, #4 #endif -.Lsrc1_aligned: +L(src1_aligned): .cfi_restore_state /* src1 is word aligned, but src2 has no common alignment with it. */ @@ -339,11 +332,11 @@ def_fn __strcmp_arm bic src2, src2, #3 ldr data2, [src2], #4 - bhi .Loverlap1 /* C=1, Z=0 => src2[1:0] = 0b11. */ - bcs .Loverlap2 /* C=1, Z=1 => src2[1:0] = 0b10. */ + bhi L(overlap1) /* C=1, Z=0 => src2[1:0] = 0b11. */ + bcs L(overlap2) /* C=1, Z=1 => src2[1:0] = 0b10. */ /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */ -.Loverlap3: +L(overlap3): bic tmp1, data1, #MSB uadd8 syndrome, data1, const_m1 eors syndrome, tmp1, data2, S2LO #8 @@ -355,14 +348,14 @@ def_fn __strcmp_arm cmp tmp1, data2, S2HI #24 bne 6f ldr data1, [src1], #4 - b .Loverlap3 + b L(overlap3) 4: S2LO data2, data2, #8 - b .Lstrcmp_tail + b L(strcmp_tail) 5: bics syndrome, syndrome, #MSB - bne .Lstrcmp_done_equal + bne L(strcmp_done_equal) /* We can only get here if the MSB of data1 contains 0, so fast-path the exit. */ @@ -381,10 +374,10 @@ def_fn __strcmp_arm .cfi_restore_state S2LO data1, data1, #24 and data2, data2, #LSB - b .Lstrcmp_tail + b L(strcmp_tail) .p2align 5,,12 /* Ensure at least 3 instructions in cache line. 
*/ -.Loverlap2: +L(overlap2): and tmp1, data1, const_m1, S2LO #16 uadd8 syndrome, data1, const_m1 eors syndrome, tmp1, data2, S2LO #16 @@ -396,28 +389,28 @@ def_fn __strcmp_arm cmp tmp1, data2, S2HI #16 bne 6f ldr data1, [src1], #4 - b .Loverlap2 + b L(overlap2) 4: S2LO data2, data2, #16 - b .Lstrcmp_tail + b L(strcmp_tail) 5: ands syndrome, syndrome, const_m1, S2LO #16 - bne .Lstrcmp_done_equal + bne L(strcmp_done_equal) ldrh data2, [src2] S2LO data1, data1, #16 #ifdef __ARM_BIG_ENDIAN lsl data2, data2, #16 #endif - b .Lstrcmp_tail + b L(strcmp_tail) 6: S2LO data1, data1, #16 and data2, data2, const_m1, S2LO #16 - b .Lstrcmp_tail + b L(strcmp_tail) .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */ -.Loverlap1: +L(overlap1): and tmp1, data1, #LSB uadd8 syndrome, data1, const_m1 eors syndrome, tmp1, data2, S2LO #24 @@ -429,20 +422,20 @@ def_fn __strcmp_arm cmp tmp1, data2, S2HI #8 bne 6f ldr data1, [src1], #4 - b .Loverlap1 + b L(overlap1) 4: S2LO data2, data2, #24 - b .Lstrcmp_tail + b L(strcmp_tail) 5: tst syndrome, #LSB - bne .Lstrcmp_done_equal + bne L(strcmp_done_equal) ldr data2, [src2] 6: S2LO data1, data1, #8 bic data2, data2, #MSB - b .Lstrcmp_tail + b L(strcmp_tail) -.Lstrcmp_done_equal: +L(strcmp_done_equal): mov result, #0 .cfi_remember_state ldrd r4, r5, [sp], #16 @@ -453,7 +446,7 @@ def_fn __strcmp_arm .cfi_restore 7 bx lr -.Lstrcmp_tail: +L(strcmp_tail): .cfi_restore_state #ifndef __ARM_BIG_ENDIAN rev data1, data1 @@ -475,5 +468,5 @@ def_fn __strcmp_arm .cfi_restore 7 sub result, result, data2, lsr #24 bx lr - .cfi_endproc - .size __strcmp, . - .Lstrcmp_start_addr + +END (__strcmp_arm) diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S index 279ec87..76e6930 100644 --- a/string/arm/strlen-armv6t2.S +++ b/string/arm/strlen-armv6t2.S @@ -11,13 +11,7 @@ */ - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm +#include "../asmdefs.h" #ifdef __ARMEB__ #define S2LO lsl @@ -44,27 +38,27 @@ #define tmp1 r4 /* Overlaps const_0 */ #define tmp2 r5 -def_fn __strlen_armv6t2 p2align=6 +ENTRY (__strlen_armv6t2) pld [srcin, #0] strd r4, r5, [sp, #-8]! bic src, srcin, #7 mvn const_m1, #0 ands tmp1, srcin, #7 /* (8 - bytes) to alignment. */ pld [src, #32] - bne.w .Lmisaligned8 + bne.w L(misaligned8) mov const_0, #0 mov result, #-8 -.Lloop_aligned: +L(loop_aligned): /* Bytes 0-7. */ ldrd data1a, data1b, [src] pld [src, #64] add result, result, #8 -.Lstart_realigned: +L(start_realigned): uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ uadd8 data1b, data1b, const_m1 sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ - cbnz data1b, .Lnull_found + cbnz data1b, L(null_found) /* Bytes 8-15. */ ldrd data1a, data1b, [src, #8] @@ -73,7 +67,7 @@ def_fn __strlen_armv6t2 p2align=6 sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ uadd8 data1b, data1b, const_m1 sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ - cbnz data1b, .Lnull_found + cbnz data1b, L(null_found) /* Bytes 16-23. */ ldrd data1a, data1b, [src, #16] @@ -82,7 +76,7 @@ def_fn __strlen_armv6t2 p2align=6 sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ uadd8 data1b, data1b, const_m1 sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ - cbnz data1b, .Lnull_found + cbnz data1b, L(null_found) /* Bytes 24-31. 
*/ ldrd data1a, data1b, [src, #24] @@ -93,9 +87,9 @@ def_fn __strlen_armv6t2 p2align=6 uadd8 data1b, data1b, const_m1 sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ cmp data1b, #0 - beq .Lloop_aligned + beq L(loop_aligned) -.Lnull_found: +L(null_found): cmp data1a, #0 itt eq addeq result, result, #4 @@ -108,7 +102,7 @@ def_fn __strlen_armv6t2 p2align=6 add result, result, data1a, lsr #3 /* Bits -> Bytes. */ bx lr -.Lmisaligned8: +L(misaligned8): ldrd data1a, data1b, [src] and tmp2, tmp1, #3 rsb result, tmp1, #0 @@ -121,5 +115,6 @@ def_fn __strlen_armv6t2 p2align=6 ornne data1b, data1b, tmp2 movne data1a, const_m1 mov const_0, #0 - b .Lstart_realigned - .size __strlen_armv6t2, . - __strlen_armv6t2 + b L(start_realigned) + +END (__strlen_armv6t2) diff --git a/string/include/stringlib.h b/string/include/stringlib.h index 3f60220..b3b6181 100644 --- a/string/include/stringlib.h +++ b/string/include/stringlib.h @@ -13,7 +13,6 @@ #endif #if __aarch64__ -void *__memcpy_bytewise (void *__restrict, const void *__restrict, size_t); void *__memcpy_aarch64 (void *__restrict, const void *__restrict, size_t); void *__memmove_aarch64 (void *, const void *, size_t); void *__memset_aarch64 (void *, int, size_t); @@ -22,6 +21,7 @@ int __memcmp_aarch64 (const void *, const void *, size_t); char *__strcpy_aarch64 (char *__restrict, const char *__restrict); int __strcmp_aarch64 (const char *, const char *); char *__strchr_aarch64 (const char *, int); +char *__strrchr_aarch64 (const char *, int); char *__strchrnul_aarch64 (const char *, int ); size_t __strlen_aarch64 (const char *); size_t __strnlen_aarch64 (const char *, size_t); diff --git a/string/memcpy_bytewise.S b/string/memcpy_bytewise.S deleted file mode 100644 index 7ee3474..0000000 --- a/string/memcpy_bytewise.S +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Trivial AArch64 memcpy. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#if __aarch64__ -#include "asmdefs.h" - -ENTRY (__memcpy_bytewise) - cbz x2, 2f - mov x3, 0 -1: - ldrb w4, [x1, x3] - strb w4, [x0, x3] - add x3, x3, 1 - cmp x3, x2 - bne 1b -2: - ret -END (__memcpy_bytewise) -#endif diff --git a/string/strrchr.S b/string/strrchr.S index 18b1cf9..119b1d5 100644 --- a/string/strrchr.S +++ b/string/strrchr.S @@ -6,6 +6,7 @@ */ #if __aarch64__ +#include "aarch64/strrchr.S" # if __ARM_FEATURE_SVE #include "aarch64/strrchr-sve.S" # endif diff --git a/string/test/memcpy.c b/string/test/memcpy.c index e31f359..8572452 100644 --- a/string/test/memcpy.c +++ b/string/test/memcpy.c @@ -19,7 +19,6 @@ static const struct fun #define F(x) {#x, x}, F(memcpy) #if __aarch64__ -F(__memcpy_bytewise) F(__memcpy_aarch64) # if __ARM_NEON F(__memcpy_aarch64_simd) diff --git a/string/test/strrchr.c b/string/test/strrchr.c index 6277fae..532fa51 100644 --- a/string/test/strrchr.c +++ b/string/test/strrchr.c @@ -20,6 +20,7 @@ static const struct fun #define F(x) {#x, x}, F(strrchr) #if __aarch64__ +F(__strrchr_aarch64) # if __ARM_FEATURE_SVE F(__strrchr_aarch64_sve) # endif |
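
In the math/pow.c hunk, the subnormal path now multiplies opt_barrier_double (x) rather than x by 0x1p52. As the new comment notes, without the barrier some versions of clang evaluate the multiply unconditionally, outside the topx == 0 branch, and for large finite x the speculative x * 0x1p52 overflows and raises a spurious exception. The barrier's definition is not part of this diff; a minimal sketch of one portable way to write it (an assumption, not necessarily the library's actual implementation) is to pass the value through a volatile temporary so the compiler treats it as opaque:

/* Assumed sketch of an optimization barrier; the real definition may
   differ.  The volatile round trip keeps the dependent multiply from
   being evaluated outside the branch that guards it.  */
static inline double
opt_barrier_double (double x)
{
  volatile double y = x;
  return y;
}

An inline-asm form such as __asm__ ("" : "+w" (x)) on AArch64 has the same effect without spilling to the stack.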
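Most of the churn in the string/ files is mechanical: each file's private def_fn macro, .Lfoo labels and trailing .size directives are replaced by the shared ENTRY, ENTRY_ALIGN, END and L() macros pulled in through #include "../asmdefs.h". That header is not shown in this diff; the sketch below is only an inference from the boilerplate being removed (the .cfi_* lines dropped from strcmp-armv6m.S suggest the macros also emit unwind markers), not its exact contents:

/* Hypothetical string/asmdefs.h-style macros, reconstructed from the
   per-file boilerplate this patch deletes.  */
#define ENTRY_ALIGN(name, alignment) \
  .global name;                      \
  .type name, %function;             \
  .p2align alignment;                \
  name:                              \
  .cfi_startproc;

#define ENTRY(name) ENTRY_ALIGN (name, 6)

#define END(name) \
  .cfi_endproc;   \
  .size name, . - name;

#define L(l) .L ## l

Centralising the definitions keeps symbol type, alignment, size and unwind information consistent across every routine, which is why the per-file .size and .cfi_* lines disappear in the hunks above.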
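Several of the AArch64 routines touched here (strcpy, strlen, strncmp, strnlen) rely on the word-at-a-time NUL test their comments describe: for a byte X, (X - 1) & ~X & 0x80 is non-zero exactly when X is zero, and with the REP8_01/REP8_7f constants the same test runs over all eight bytes of a 64-bit word at once. A minimal C model of the check (illustrative only, not code from this patch):

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* Non-zero iff some byte of x is zero: per byte, (b - 1) & ~(b | 0x7f)
   has bit 7 set only when b == 0, and the subtraction runs across the
   whole word in parallel.  */
static inline uint64_t
has_zero_byte (uint64_t x)
{
  return (x - REP8_01) & ~(x | REP8_7f);
}

The assembly computes this with a sub/orr/bics sequence on each 64-bit word, then locates the terminating byte with clz after a rev or rbit; as the strnlen comment warns, big-endian needs extra care because the borrow from the subtraction can propagate across byte boundaries.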