author     android-build-team Robot <android-build-team-robot@google.com>  2020-02-03 00:19:16 +0000
committer  android-build-team Robot <android-build-team-robot@google.com>  2020-02-03 00:19:16 +0000
commit     2f2130a33c81427385e09479998bc036bea44a69 (patch)
tree       783ed38b7d7eb9ac348244f97c590a1c6a7aad97
parent     b14a0c18de5ddde3c32c69248be9f471d6b2b1eb (diff)
parent     8edcec53c6d84dc7f85e4c0a8539384b3fe489ec (diff)
download   arm-optimized-routines-android11-gsi.tar.gz
Snap for 6176706 from 8edcec53c6d84dc7f85e4c0a8539384b3fe489ec to rvc-release
Change-Id: I159f39c18b7ed35f3f77421afbe3653225426fd8
-rw-r--r--  METADATA                     |   8
-rw-r--r--  README                       |   6
-rw-r--r--  contributor-agreement.pdf    | bin 0 -> 104680 bytes
-rw-r--r--  math/include/mathlib.h       |   4
-rw-r--r--  math/pow.c                   |   4
-rw-r--r--  math/s_pow.c                 |   6
-rw-r--r--  math/test/mathbench.c        |  28
-rwxr-xr-x  math/test/runulp.sh          |  76
-rw-r--r--  math/test/ulp.c              |  19
-rw-r--r--  math/v_math.h                |  30
-rw-r--r--  math/v_pow.c                 |  27
-rw-r--r--  math/vn_pow.c                |  12
-rw-r--r--  string/aarch64/memchr.S      |  38
-rw-r--r--  string/aarch64/memcmp.S      |  14
-rw-r--r--  string/aarch64/memset.S      |  15
-rw-r--r--  string/aarch64/strchr.S      |  24
-rw-r--r--  string/aarch64/strchrnul.S   |  24
-rw-r--r--  string/aarch64/strcmp.S      |  15
-rw-r--r--  string/aarch64/strcpy.S      |  54
-rw-r--r--  string/aarch64/strlen.S      |  18
-rw-r--r--  string/aarch64/strncmp.S     |  75
-rw-r--r--  string/aarch64/strnlen.S     |  39
-rw-r--r--  string/aarch64/strrchr.S     | 147
-rw-r--r--  string/arm/memcpy.S          |  48
-rw-r--r--  string/arm/strcmp-armv6m.S   |  11
-rw-r--r--  string/arm/strcmp.S          | 151
-rw-r--r--  string/arm/strlen-armv6t2.S  |  33
-rw-r--r--  string/include/stringlib.h   |   2
-rw-r--r--  string/memcpy_bytewise.S     |  23
-rw-r--r--  string/strrchr.S             |   1
-rw-r--r--  string/test/memcpy.c         |   1
-rw-r--r--  string/test/strrchr.c        |   1
32 files changed, 598 insertions, 356 deletions
diff --git a/METADATA b/METADATA
index 73c448b..94791ae 100644
--- a/METADATA
+++ b/METADATA
@@ -9,11 +9,11 @@ third_party {
type: GIT
value: "https://github.com/ARM-software/optimized-routines.git"
}
- version: "3377796fe24ff1d5396609205426402678208eb1"
+ version: "33ba19089a261964e1e84ba4edf90263b468c161"
license_type: NOTICE
last_upgrade_date {
- year: 2019
- month: 12
- day: 26
+ year: 2020
+ month: 2
+ day: 1
}
}
diff --git a/README b/README
index 76fe018..440f08a 100644
--- a/README
+++ b/README
@@ -3,8 +3,10 @@ Arm Optimized Routines
This repository contains implementations of library functions
provided by Arm under MIT License (See LICENSE). Contributions
-to this project are accepted, but the terms will need negotiation (so
-relicensing and copyright assignment to the FSF is possible later).
+to this project are accepted, but Contributors have to sign an
+Assignment Agreement, please follow the instructions in
+contributor-agreement.pdf. This is needed so upstreaming code
+to projects that require copyright assignment is possible.
Regular quarterly releases are tagged as vYY.MM (e.g. v19.11).
diff --git a/contributor-agreement.pdf b/contributor-agreement.pdf
new file mode 100644
index 0000000..f42c3ae
--- /dev/null
+++ b/contributor-agreement.pdf
Binary files differ
diff --git a/math/include/mathlib.h b/math/include/mathlib.h
index 254954a..4493008 100644
--- a/math/include/mathlib.h
+++ b/math/include/mathlib.h
@@ -36,6 +36,7 @@ double __s_sin (double);
double __s_cos (double);
double __s_exp (double);
double __s_log (double);
+double __s_pow (double, double);
#if __aarch64__
#if __GNUC__ >= 5
@@ -61,6 +62,7 @@ __f64x2_t __v_sin (__f64x2_t);
__f64x2_t __v_cos (__f64x2_t);
__f64x2_t __v_exp (__f64x2_t);
__f64x2_t __v_log (__f64x2_t);
+__f64x2_t __v_pow (__f64x2_t, __f64x2_t);
#if __GNUC__ >= 9 || __clang_major__ >= 8
#define __vpcs __attribute__((__aarch64_vector_pcs__))
@@ -78,6 +80,7 @@ __vpcs __f64x2_t __vn_sin (__f64x2_t);
__vpcs __f64x2_t __vn_cos (__f64x2_t);
__vpcs __f64x2_t __vn_exp (__f64x2_t);
__vpcs __f64x2_t __vn_log (__f64x2_t);
+__vpcs __f64x2_t __vn_pow (__f64x2_t, __f64x2_t);
/* Vector functions following the vector PCS using ABI names. */
__vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
@@ -90,6 +93,7 @@ __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t);
#endif
#endif
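
The header now exports a two-argument pow for each family declared here: __s_pow (scalar companion to the vector code), __v_pow (plain AdvSIMD), and __vn_pow with its vector-PCS ABI alias _ZGVnN2vv_pow. A minimal caller sketch, assuming an AArch64 compiler new enough that mathlib.h defines __f64x2_t and enables the __vpcs declarations (illustration only, not part of the patch):

    #include <stdio.h>
    #include "mathlib.h"

    int
    main (void)
    {
      __f64x2_t x = { 2.0, 9.0 };
      __f64x2_t y = { 10.0, 0.5 };
      __f64x2_t z = _ZGVnN2vv_pow (x, y); /* lane 0: 2^10, lane 1: 9^0.5 */
      printf ("%g %g\n", z[0], z[1]);     /* expect 1024 3 */
      return 0;
    }
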
diff --git a/math/pow.c b/math/pow.c
index 493488d..ced7c4f 100644
--- a/math/pow.c
+++ b/math/pow.c
@@ -347,7 +347,9 @@ pow (double x, double y)
if (topx == 0)
{
/* Normalize subnormal x so exponent becomes negative. */
- ix = asuint64 (x * 0x1p52);
+ /* Without the barrier some versions of clang evaluate the mul
+ unconditionally causing spurious overflow exceptions. */
+ ix = asuint64 (opt_barrier_double (x) * 0x1p52);
ix &= 0x7fffffffffffffff;
ix -= 52ULL << 52;
}
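
The fix wraps the subnormal operand in opt_barrier_double so that clang cannot evaluate x * 0x1p52 outside the topx == 0 branch, where a large finite x would overflow and raise a spurious exception. opt_barrier_double lives in the library's math_config.h; such a barrier is normally just a round trip through a volatile, along these lines (a sketch of the common idiom, not necessarily the library's exact definition):

    /* Forcing the value through a volatile object makes it opaque to the
       optimizer, so the multiply above stays inside its branch.  */
    static inline double
    opt_barrier_double (double x)
    {
      volatile double y = x;
      return y;
    }
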
diff --git a/math/s_pow.c b/math/s_pow.c
new file mode 100644
index 0000000..2e34c9f
--- /dev/null
+++ b/math/s_pow.c
@@ -0,0 +1,6 @@
+/*
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#define SCALAR 1
+#include "v_pow.c"
diff --git a/math/test/mathbench.c b/math/test/mathbench.c
index 8d3ff1d..33ceda3 100644
--- a/math/test/mathbench.c
+++ b/math/test/mathbench.c
@@ -128,6 +128,18 @@ xy_Z_powf (v_float x)
{
return _ZGVnN4vv_powf (x, x);
}
+
+__vpcs static v_double
+xy__vn_pow (v_double x)
+{
+ return __vn_pow (x, x);
+}
+
+__vpcs static v_double
+xy_Z_pow (v_double x)
+{
+ return _ZGVnN2vv_pow (x, x);
+}
#endif
static v_float
@@ -135,6 +147,12 @@ xy__v_powf (v_float x)
{
return __v_powf (x, x);
}
+
+static v_double
+xy__v_pow (v_double x)
+{
+ return __v_pow (x, x);
+}
#endif
static float
@@ -142,6 +160,12 @@ xy__s_powf (float x)
{
return __s_powf (x, x);
}
+
+static double
+xy__s_pow (double x)
+{
+ return __s_pow (x, x);
+}
#endif
static double
@@ -256,6 +280,7 @@ D (__s_sin, -3.1, 3.1)
D (__s_cos, -3.1, 3.1)
D (__s_exp, -9.9, 9.9)
D (__s_log, 0.01, 11.1)
+{"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}},
F (__s_expf, -9.9, 9.9)
F (__s_expf_1u, -9.9, 9.9)
F (__s_exp2f, -9.9, 9.9)
@@ -270,6 +295,7 @@ VD (__v_sin, -3.1, 3.1)
VD (__v_cos, -3.1, 3.1)
VD (__v_exp, -9.9, 9.9)
VD (__v_log, 0.01, 11.1)
+{"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}},
VF (__v_dummyf, 1.0, 2.0)
VF (__v_expf, -9.9, 9.9)
VF (__v_expf_1u, -9.9, 9.9)
@@ -285,6 +311,8 @@ VND (__vn_exp, -9.9, 9.9)
VND (_ZGVnN2v_exp, -9.9, 9.9)
VND (__vn_log, 0.01, 11.1)
VND (_ZGVnN2v_log, 0.01, 11.1)
+{"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}},
+{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}},
VND (__vn_sin, -3.1, 3.1)
VND (_ZGVnN2v_sin, -3.1, 3.1)
VND (__vn_cos, -3.1, 3.1)
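
Because pow takes two arguments it cannot use the single-argument D/VD/VND helper macros, so its benchmark entries are written out by hand. Read field by field, they appear to be: name, precision ('d' or 'f'), variant (0 scalar, 'v' plain vector, 'n' vector PCS), the sampling interval, and the wrapper that feeds x to both arguments (an inference from the neighbouring entries, not something the patch states):

    {"_ZGVnN2vv_pow",     /* displayed name                      */
     'd',                 /* double precision                    */
     'n',                 /* vector-PCS variant                  */
     0.01, 11.1,          /* interval sampled for x (and y == x) */
     {.vnd = xy_Z_pow}},  /* wrapper defined earlier in the hunk */
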
diff --git a/math/test/runulp.sh b/math/test/runulp.sh
index 44393b8..a8c391b 100755
--- a/math/test/runulp.sh
+++ b/math/test/runulp.sh
@@ -45,6 +45,16 @@ t exp2 -0x1p-6 -0x1p6 40000
t exp2 633.3 733.3 10000
t exp2 -633.3 -777.3 10000
+L=0.02
+t log 0 0xffff000000000000 10000
+t log 0x1p-4 0x1p4 40000
+t log 0 inf 40000
+
+L=0.05
+t log2 0 0xffff000000000000 10000
+t log2 0x1p-4 0x1p4 40000
+t log2 0 inf 40000
+
L=0.05
t pow 0.5 2.0 x 0 inf 20000
t pow -0.5 -2.0 x 0 inf 20000
@@ -72,6 +82,16 @@ t exp2f 0 0xffff0000 10000
t exp2f 0x1p-14 0x1p8 50000
t exp2f -0x1p-14 -0x1p8 50000
+L=0.32
+t logf 0 0xffff0000 10000
+t logf 0x1p-4 0x1p4 50000
+t logf 0 inf 50000
+
+L=0.26
+t log2f 0 0xffff0000 10000
+t log2f 0x1p-4 0x1p4 50000
+t log2f 0 inf 50000
+
L=0.06
t sinf 0 0xffff0000 10000
t sinf 0x1p-14 0x1p54 50000
@@ -82,6 +102,16 @@ t cosf 0 0xffff0000 10000
t cosf 0x1p-14 0x1p54 50000
t cosf -0x1p-14 -0x1p54 50000
+L=0.06
+t sincosf_sinf 0 0xffff0000 10000
+t sincosf_sinf 0x1p-14 0x1p54 50000
+t sincosf_sinf -0x1p-14 -0x1p54 50000
+
+L=0.06
+t sincosf_cosf 0 0xffff0000 10000
+t sincosf_cosf 0x1p-14 0x1p54 50000
+t sincosf_cosf -0x1p-14 -0x1p54 50000
+
L=0.4
t powf 0x1p-1 0x1p1 x 0x1p-7 0x1p7 50000
t powf 0x1p-1 0x1p1 x -0x1p-7 -0x1p7 50000
@@ -110,6 +140,28 @@ range_exp='
-633.3 -777.3 10000
'
+range_log='
+ 0 0xffff000000000000 10000
+ 0x1p-4 0x1p4 400000
+ 0 inf 400000
+'
+
+range_pow='
+ 0x1p-1 0x1p1 x 0x1p-10 0x1p10 50000
+ 0x1p-1 0x1p1 x -0x1p-10 -0x1p10 50000
+ 0x1p-500 0x1p500 x 0x1p-1 0x1p1 50000
+ 0x1p-500 0x1p500 x -0x1p-1 -0x1p1 50000
+ 0x1.ep-1 0x1.1p0 x 0x1p8 0x1p16 50000
+ 0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p16 50000
+'
+
+range_sin='
+ 0 0xffff000000000000 10000
+ 0x1p-4 0x1p4 400000
+ -0x1p-23 0x1p23 400000
+'
+range_cos="$range_sin"
+
range_expf='
0 0xffff0000 10000
0x1p-14 0x1p8 500000
@@ -143,6 +195,10 @@ range_powf='
# error limits
L_exp=1.9
+L_log=1.2
+L_pow=0.05
+L_sin=3.0
+L_cos=3.0
L_expf=1.49
L_expf_1u=0.4
L_exp2f=1.49
@@ -173,6 +229,26 @@ exp __v_exp $runv
exp __vn_exp $runvn
exp _ZGVnN2v_exp $runvn
+log __s_log $runs
+log __v_log $runv
+log __vn_log $runvn
+log _ZGVnN2v_log $runvn
+
+pow __s_pow $runs
+pow __v_pow $runv
+pow __vn_pow $runvn
+pow _ZGVnN2vv_pow $runvn
+
+sin __s_sin $runs
+sin __v_sin $runv
+sin __vn_sin $runvn
+sin _ZGVnN2v_sin $runvn
+
+cos __s_cos $runs
+cos __v_cos $runv
+cos __vn_cos $runvn
+cos _ZGVnN2v_cos $runvn
+
expf __s_expf $runs
expf __v_expf $runv
expf __vn_expf $runvn
diff --git a/math/test/ulp.c b/math/test/ulp.c
index b746080..371567a 100644
--- a/math/test/ulp.c
+++ b/math/test/ulp.c
@@ -214,6 +214,16 @@ struct conf
double errlim;
};
+/* Wrappers for sincos. */
+static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);}
+static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);}
+static double sincos_sin(double x) {(void)cos(x); return sin(x);}
+static double sincos_cos(double x) {(void)sin(x); return cos(x);}
+#if USE_MPFR
+static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); }
+static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); }
+#endif
+
/* A bit of a hack: call vector functions twice with the same
input in lane 0 but a different value in other lanes: once
with an in-range value and then with a special case value. */
@@ -240,6 +250,7 @@ static double v_sin(double x) { return __v_sin(argd(x))[0]; }
static double v_cos(double x) { return __v_cos(argd(x))[0]; }
static double v_exp(double x) { return __v_exp(argd(x))[0]; }
static double v_log(double x) { return __v_log(argd(x))[0]; }
+static double v_pow(double x, double y) { return __v_pow(argd(x),argd(y))[0]; }
#ifdef __vpcs
static float vn_sinf(float x) { return __vn_sinf(argf(x))[0]; }
static float vn_cosf(float x) { return __vn_cosf(argf(x))[0]; }
@@ -253,6 +264,7 @@ static double vn_sin(double x) { return __vn_sin(argd(x))[0]; }
static double vn_cos(double x) { return __vn_cos(argd(x))[0]; }
static double vn_exp(double x) { return __vn_exp(argd(x))[0]; }
static double vn_log(double x) { return __vn_log(argd(x))[0]; }
+static double vn_pow(double x, double y) { return __vn_pow(argd(x),argd(y))[0]; }
static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; }
static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; }
static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; }
@@ -263,6 +275,7 @@ static double Z_sin(double x) { return _ZGVnN2v_sin(argd(x))[0]; }
static double Z_cos(double x) { return _ZGVnN2v_cos(argd(x))[0]; }
static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; }
static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; }
+static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; }
#endif
#endif
@@ -311,6 +324,8 @@ static const struct fun fun[] = {
#define D2(x) F (x, x, x##l, mpfr_##x, 2, 0, d2, 0)
F1 (sin)
F1 (cos)
+ F (sincosf_sinf, sincosf_sinf, sincos_sin, sincos_mpfr_sin, 1, 1, f1, 0)
+ F (sincosf_cosf, sincosf_cosf, sincos_cos, sincos_mpfr_cos, 1, 1, f1, 0)
F1 (exp)
F1 (exp2)
F1 (log)
@@ -334,6 +349,7 @@ static const struct fun fun[] = {
F (__s_cos, __s_cos, cosl, mpfr_cos, 1, 0, d1, 0)
F (__s_exp, __s_exp, expl, mpfr_exp, 1, 0, d1, 0)
F (__s_log, __s_log, logl, mpfr_log, 1, 0, d1, 0)
+ F (__s_pow, __s_pow, powl, mpfr_pow, 2, 0, d2, 0)
#if __aarch64__
F (__v_sinf, v_sinf, sin, mpfr_sin, 1, 1, f1, 1)
F (__v_cosf, v_cosf, cos, mpfr_cos, 1, 1, f1, 1)
@@ -347,6 +363,7 @@ static const struct fun fun[] = {
F (__v_cos, v_cos, cosl, mpfr_cos, 1, 0, d1, 1)
F (__v_exp, v_exp, expl, mpfr_exp, 1, 0, d1, 1)
F (__v_log, v_log, logl, mpfr_log, 1, 0, d1, 1)
+ F (__v_pow, v_pow, powl, mpfr_pow, 2, 0, d2, 1)
#ifdef __vpcs
F (__vn_sinf, vn_sinf, sin, mpfr_sin, 1, 1, f1, 1)
F (__vn_cosf, vn_cosf, cos, mpfr_cos, 1, 1, f1, 1)
@@ -360,6 +377,7 @@ static const struct fun fun[] = {
F (__vn_cos, vn_cos, cosl, mpfr_cos, 1, 0, d1, 1)
F (__vn_exp, vn_exp, expl, mpfr_exp, 1, 0, d1, 1)
F (__vn_log, vn_log, logl, mpfr_log, 1, 0, d1, 1)
+ F (__vn_pow, vn_pow, powl, mpfr_pow, 2, 0, d2, 1)
F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1)
F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1)
F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1)
@@ -370,6 +388,7 @@ static const struct fun fun[] = {
F (_ZGVnN2v_cos, Z_cos, cosl, mpfr_cos, 1, 0, d1, 1)
F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1)
F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1)
+ F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1)
#endif
#endif
#endif
diff --git a/math/v_math.h b/math/v_math.h
index 0861e98..3db22e5 100644
--- a/math/v_math.h
+++ b/math/v_math.h
@@ -249,6 +249,11 @@ v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
return f (x1, x2);
}
+static inline int
+v_lanes64 (void)
+{
+ return 1;
+}
static inline v_f64_t
v_f64 (f64_t x)
{
@@ -264,6 +269,16 @@ v_s64 (s64_t x)
{
return x;
}
+static inline f64_t
+v_get_f64 (v_f64_t x, int i)
+{
+ return x;
+}
+static inline void
+v_set_f64 (v_f64_t *x, int i, f64_t v)
+{
+ *x = v;
+}
/* true if any elements of a v_cond result is non-zero. */
static inline int
v_any_u64 (v_u64_t x)
@@ -506,6 +521,11 @@ v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]};
}
+static inline int
+v_lanes64 (void)
+{
+ return 2;
+}
static inline v_f64_t
v_f64 (f64_t x)
{
@@ -521,6 +541,16 @@ v_s64 (s64_t x)
{
return (v_s64_t){x, x};
}
+static inline f64_t
+v_get_f64 (v_f64_t x, int i)
+{
+ return x[i];
+}
+static inline void
+v_set_f64 (v_f64_t *x, int i, f64_t v)
+{
+ (*x)[i] = v;
+}
/* true if any elements of a v_cond result is non-zero. */
static inline int
v_any_u64 (v_u64_t x)
diff --git a/math/v_pow.c b/math/v_pow.c
new file mode 100644
index 0000000..a209d57
--- /dev/null
+++ b/math/v_pow.c
@@ -0,0 +1,27 @@
+/*
+ * Double-precision vector pow function.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#if V_SUPPORTED
+
+VPCS_ATTR
+v_f64_t
+V_NAME(pow) (v_f64_t x, v_f64_t y)
+{
+ v_f64_t z;
+ for (int lane = 0; lane < v_lanes64 (); lane++)
+ {
+ f64_t sx = v_get_f64 (x, lane);
+ f64_t sy = v_get_f64 (y, lane);
+ f64_t sz = pow (sx, sy);
+ v_set_f64 (&z, lane, sz);
+ }
+ return z;
+}
+VPCS_ALIAS
+#endif
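
v_pow.c is a lane-by-lane fallback rather than a genuinely vectorised pow: it relies on the v_lanes64/v_get_f64/v_set_f64 helpers added to v_math.h above, so one source file serves both builds. In the SCALAR build (s_pow.c) v_f64_t is a plain double and v_lanes64() returns 1, so the function effectively collapses to (an illustration of the expansion, not generated code):

    double
    __s_pow (double x, double y)
    {
      return pow (x, y);
    }

In the vector build v_lanes64() returns 2 and the loop simply calls the scalar pow once per lane.
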
diff --git a/math/vn_pow.c b/math/vn_pow.c
new file mode 100644
index 0000000..2609501
--- /dev/null
+++ b/math/vn_pow.c
@@ -0,0 +1,12 @@
+/*
+ * AdvSIMD vector PCS variant of __v_pow.
+ *
+ * Copyright (c) 2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+#include "mathlib.h"
+#ifdef __vpcs
+#define VPCS 1
+#define VPCS_ALIAS strong_alias (__vn_pow, _ZGVnN2vv_pow)
+#include "v_pow.c"
+#endif
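
vn_pow.c compiles the same source a third time with the vector PCS enabled and then publishes the result under the GLIBC vector-ABI name via VPCS_ALIAS. strong_alias is presumably the usual GCC/clang alias-attribute idiom, roughly (an assumption about its shape; the library defines it in its own headers):

    #define strong_alias(name, aliasname) \
      extern __typeof (name) aliasname __attribute__ ((alias (#name)));
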
diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S
index aff6e3d..10be49e 100644
--- a/string/aarch64/memchr.S
+++ b/string/aarch64/memchr.S
@@ -11,6 +11,8 @@
* Neon Available.
*/
+#include "../asmdefs.h"
+
/* Arguments and results. */
#define srcin x0
#define chrin w1
@@ -44,17 +46,9 @@
* identify exactly which byte has matched.
*/
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
-
-def_fn __memchr_aarch64
+ENTRY (__memchr_aarch64)
/* Do not dereference srcin if no bytes to compare. */
- cbz cntin, .Lzero_length
+ cbz cntin, L(zero_length)
/*
* Magic constant 0x40100401 allows us to identify which lane matches
* the requested byte.
@@ -67,7 +61,7 @@ def_fn __memchr_aarch64
dup vrepmask.4s, wtmp2
ands soff, srcin, #31
and cntrem, cntin, #31
- b.eq .Lloop
+ b.eq L(loop)
/*
* Input string is not 32-byte aligned. We calculate the syndrome
@@ -90,25 +84,25 @@ def_fn __memchr_aarch64
lsr synd, synd, tmp
lsl synd, synd, tmp
/* The first block can also be the last */
- b.ls .Lmasklast
+ b.ls L(masklast)
/* Have we found something already? */
- cbnz synd, .Ltail
+ cbnz synd, L(tail)
-.Lloop:
+L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
subs cntin, cntin, #32
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
/* If we're out of data we finish regardless of the result */
- b.ls .Lend
+ b.ls L(end)
/* Use a fast check for the termination condition */
orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
addp vend.2d, vend.2d, vend.2d
mov synd, vend.d[0]
/* We're not out of data, loop if we haven't found the character */
- cbz synd, .Lloop
+ cbz synd, L(loop)
-.Lend:
+L(end):
/* Termination condition found, let's calculate the syndrome value */
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
@@ -116,9 +110,9 @@ def_fn __memchr_aarch64
addp vend.16b, vend.16b, vend.16b /* 128->64 */
mov synd, vend.d[0]
/* Only do the clear for the last possible block */
- b.hi .Ltail
+ b.hi L(tail)
-.Lmasklast:
+L(masklast):
/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
add tmp, cntrem, soff
and tmp, tmp, #31
@@ -127,7 +121,7 @@ def_fn __memchr_aarch64
lsl synd, synd, tmp
lsr synd, synd, tmp
-.Ltail:
+L(tail):
/* Count the trailing zeros using bit reversing */
rbit synd, synd
/* Compensate the last post-increment */
@@ -142,8 +136,8 @@ def_fn __memchr_aarch64
csel result, xzr, result, eq
ret
-.Lzero_length:
+L(zero_length):
mov result, #0
ret
- .size __memchr_aarch64, . - __memchr_aarch64
+END (__memchr_aarch64)
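
Most of the string changes below are mechanical: each file drops its private def_fn macro and .Lfoo local labels in favour of the shared ENTRY/ENTRY_ALIGN/END and L() macros from string/asmdefs.h, so symbol type, alignment, and .size are set in one place. The header itself is not part of this diff; its macros presumably look roughly like this (a sketch for orientation, not the verbatim header):

    #define L(label) .L ## label

    #define ENTRY_ALIGN(name, alignment) \
      .global name;                      \
      .type name, %function;             \
      .p2align alignment;                \
      name:                              \
      .cfi_startproc;

    #define ENTRY(name) ENTRY_ALIGN (name, 6)

    #define END(name) \
      .cfi_endproc;   \
      .size name, . - name;
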
diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S
index 72a66bc..6722516 100644
--- a/string/aarch64/memcmp.S
+++ b/string/aarch64/memcmp.S
@@ -9,7 +9,7 @@
* ARMv8-a, AArch64, unaligned accesses.
*/
-#define L(l) .L ## l
+#include "../asmdefs.h"
/* Parameters and result. */
#define src1 x0
@@ -27,15 +27,7 @@
#define tmp1 x7
#define tmp2 x8
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
-
-def_fn __memcmp_aarch64 p2align=6
+ENTRY (__memcmp_aarch64)
subs limit, limit, 8
b.lo L(less8)
@@ -138,4 +130,4 @@ L(byte_loop):
sub result, data1w, data2w
ret
- .size __memcmp_aarch64, . - __memcmp_aarch64
+END (__memcmp_aarch64)
diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S
index aef22e9..3868141 100644
--- a/string/aarch64/memset.S
+++ b/string/aarch64/memset.S
@@ -11,6 +11,7 @@
*
*/
+#include "../asmdefs.h"
#define dstin x0
#define val x1
@@ -25,17 +26,7 @@
#define zva_len x7
#define zva_lenw w7
-#define L(l) .L ## l
-
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
-
-def_fn __memset_aarch64 p2align=6
+ENTRY (__memset_aarch64)
dup v0.16B, valw
add dstend, dstin, count
@@ -185,4 +176,4 @@ L(zva_other):
4: add count, count, zva_len
b L(tail64)
- .size __memset_aarch64, . - __memset_aarch64
+END (__memset_aarch64)
diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S
index cdb38aa..00d9be3 100644
--- a/string/aarch64/strchr.S
+++ b/string/aarch64/strchr.S
@@ -11,6 +11,8 @@
* Neon Available.
*/
+#include "../asmdefs.h"
+
/* Arguments and results. */
#define srcin x0
#define chrin w1
@@ -48,15 +50,7 @@
/* Locals and temporaries. */
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
-
-def_fn __strchr_aarch64
+ENTRY (__strchr_aarch64)
/* Magic constant 0x40100401 to allow us to identify which lane
matches the requested byte. Magic constant 0x80200802 used
similarly for NUL termination. */
@@ -67,7 +61,7 @@ def_fn __strchr_aarch64
dup vrepmask_c.4s, wtmp2
ands tmp1, srcin, #31
add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
- b.eq .Lloop
+ b.eq L(loop)
/* Input string is not 32-byte aligned. Rather than forcing
the padding bytes to a safe value, we calculate the syndrome
@@ -93,9 +87,9 @@ def_fn __strchr_aarch64
mov tmp3, vend1.d[0]
bic tmp1, tmp3, tmp1 // Mask padding bits.
- cbnz tmp1, .Ltail
+ cbnz tmp1, L(tail)
-.Lloop:
+L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
@@ -107,7 +101,7 @@ def_fn __strchr_aarch64
orr vend1.16b, vend1.16b, vend2.16b
addp vend1.2d, vend1.2d, vend1.2d
mov tmp1, vend1.d[0]
- cbz tmp1, .Lloop
+ cbz tmp1, L(loop)
/* Termination condition found. Now need to establish exactly why
we terminated. */
@@ -121,7 +115,7 @@ def_fn __strchr_aarch64
addp vend1.16b, vend1.16b, vend2.16b // 128->64
mov tmp1, vend1.d[0]
-.Ltail:
+L(tail):
/* Count the trailing zeros, by bit reversing... */
rbit tmp1, tmp1
/* Re-bias source. */
@@ -134,4 +128,4 @@ def_fn __strchr_aarch64
csel result, result, xzr, eq
ret
- .size __strchr_aarch64, . - __strchr_aarch64
+END (__strchr_aarch64)
diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S
index 4aee293..81264ea 100644
--- a/string/aarch64/strchrnul.S
+++ b/string/aarch64/strchrnul.S
@@ -11,6 +11,8 @@
* Neon Available.
*/
+#include "../asmdefs.h"
+
/* Arguments and results. */
#define srcin x0
#define chrin w1
@@ -44,15 +46,7 @@
/* Locals and temporaries. */
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
-
-def_fn __strchrnul_aarch64
+ENTRY (__strchrnul_aarch64)
/* Magic constant 0x40100401 to allow us to identify which lane
matches the termination condition. */
mov wtmp2, #0x0401
@@ -61,7 +55,7 @@ def_fn __strchrnul_aarch64
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
dup vrepmask.4s, wtmp2
ands tmp1, srcin, #31
- b.eq .Lloop
+ b.eq L(loop)
/* Input string is not 32-byte aligned. Rather than forcing
the padding bytes to a safe value, we calculate the syndrome
@@ -85,9 +79,9 @@ def_fn __strchrnul_aarch64
mov tmp3, vend1.d[0]
bic tmp1, tmp3, tmp1 // Mask padding bits.
- cbnz tmp1, .Ltail
+ cbnz tmp1, L(tail)
-.Lloop:
+L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
@@ -99,7 +93,7 @@ def_fn __strchrnul_aarch64
orr vend1.16b, vhas_chr1.16b, vhas_chr2.16b
addp vend1.2d, vend1.2d, vend1.2d
mov tmp1, vend1.d[0]
- cbz tmp1, .Lloop
+ cbz tmp1, L(loop)
/* Termination condition found. Now need to establish exactly why
we terminated. */
@@ -109,7 +103,7 @@ def_fn __strchrnul_aarch64
addp vend1.16b, vend1.16b, vend1.16b // 128->64
mov tmp1, vend1.d[0]
-.Ltail:
+L(tail):
/* Count the trailing zeros, by bit reversing... */
rbit tmp1, tmp1
/* Re-bias source. */
@@ -119,4 +113,4 @@ def_fn __strchrnul_aarch64
add result, src, tmp1, lsr #1
ret
- .size __strchrnul_aarch64, . - __strchrnul_aarch64
+END (__strchrnul_aarch64)
diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S
index 2aa367c..65af5ce 100644
--- a/string/aarch64/strcmp.S
+++ b/string/aarch64/strcmp.S
@@ -10,15 +10,7 @@
* ARMv8-a, AArch64
*/
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
-
-#define L(label) .L ## label
+#include "../asmdefs.h"
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
@@ -44,7 +36,7 @@
#define pos x11
/* Start of performance-critical section -- one 64B cache line. */
-def_fn __strcmp_aarch64 p2align=6
+ENTRY (__strcmp_aarch64)
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
@@ -174,4 +166,5 @@ L(loop_misaligned):
L(done):
sub result, data1, data2
ret
- .size __strcmp_aarch64, .-__strcmp_aarch64
+
+END (__strcmp_aarch64)
diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S
index 4e10b4d..4edffcf 100644
--- a/string/aarch64/strcpy.S
+++ b/string/aarch64/strcpy.S
@@ -10,6 +10,8 @@
* ARMv8-a, AArch64, unaligned accesses, min page size 4k.
*/
+#include "../asmdefs.h"
+
/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
To test the page crossing code path more thoroughly, compile with
@@ -46,14 +48,6 @@
#define STRCPY __strcpy_aarch64
#endif
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
-
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
@@ -85,7 +79,7 @@
#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
-def_fn STRCPY p2align=6
+ENTRY (STRCPY)
/* For moderately short strings, the fastest way to do the copy is to
calculate the length of the string in the same way as strlen, then
essentially do a memcpy of the result. This avoids the need for
@@ -105,9 +99,9 @@ def_fn STRCPY p2align=6
srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte
aligned string will never fail the page align check, so will
always take the fast path. */
- b.gt .Lpage_cross
+ b.gt L(page_cross)
-.Lpage_cross_ok:
+L(page_cross_ok):
ldp data1, data2, [srcin]
#ifdef __AARCH64EB__
/* Because we expect the end to be found within 16 characters
@@ -119,7 +113,7 @@ def_fn STRCPY p2align=6
sub tmp1, tmp2, zeroones
orr tmp2, tmp2, #REP8_7f
bics has_nul1, tmp1, tmp2
- b.ne .Lfp_le8
+ b.ne L(fp_le8)
rev tmp4, data2
sub tmp3, tmp4, zeroones
orr tmp4, tmp4, #REP8_7f
@@ -127,17 +121,17 @@ def_fn STRCPY p2align=6
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
bics has_nul1, tmp1, tmp2
- b.ne .Lfp_le8
+ b.ne L(fp_le8)
sub tmp3, data2, zeroones
orr tmp4, data2, #REP8_7f
#endif
bics has_nul2, tmp3, tmp4
- b.eq .Lbulk_entry
+ b.eq L(bulk_entry)
/* The string is short (<=16 bytes). We don't know exactly how
short though, yet. Work out the exact length so that we can
quickly select the optimal copy strategy. */
-.Lfp_gt8:
+L(fp_gt8):
rev has_nul2, has_nul2
clz pos, has_nul2
mov tmp2, #56
@@ -155,12 +149,12 @@ def_fn STRCPY p2align=6
#endif
ret
-.Lfp_le8:
+L(fp_le8):
rev has_nul1, has_nul1
clz pos, has_nul1
add dst, dstin, pos, lsr #3 /* Bits to bytes. */
subs tmp2, pos, #24 /* Pos in bits. */
- b.lt .Lfp_lt4
+ b.lt L(fp_lt4)
#ifdef __AARCH64EB__
mov tmp2, #56
sub pos, tmp2, pos
@@ -176,15 +170,15 @@ def_fn STRCPY p2align=6
mov dstin, dst
#endif
ret
-.Lfp_lt4:
- cbz pos, .Lfp_lt2
+L(fp_lt4):
+ cbz pos, L(fp_lt2)
/* 2->3 bytes to copy. */
#ifdef __AARCH64EB__
lsr data1, data1, #48
#endif
strh data1w, [dstin]
/* Fall-through, one byte (max) to go. */
-.Lfp_lt2:
+L(fp_lt2):
/* Null-terminated string. Last character must be zero! */
strb wzr, [dst]
#ifdef BUILD_STPCPY
@@ -195,20 +189,20 @@ def_fn STRCPY p2align=6
.p2align 6
/* Aligning here ensures that the entry code and main loop all lies
within one 64-byte cache line. */
-.Lbulk_entry:
+L(bulk_entry):
sub to_align, to_align, #16
stp data1, data2, [dstin]
sub src, srcin, to_align
sub dst, dstin, to_align
- b .Lentry_no_page_cross
+ b L(entry_no_page_cross)
/* The inner loop deals with two Dwords at a time. This has a
slightly higher start-up cost, but we should win quite quickly,
especially on cores with a high number of issue slots per
cycle, as we get much better parallelism out of the operations. */
-.Lmain_loop:
+L(main_loop):
stp data1, data2, [dst], #16
-.Lentry_no_page_cross:
+L(entry_no_page_cross):
ldp data1, data2, [src], #16
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
@@ -217,7 +211,7 @@ def_fn STRCPY p2align=6
bic has_nul1, tmp1, tmp2
bics has_nul2, tmp3, tmp4
ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
- b.eq .Lmain_loop
+ b.eq L(main_loop)
/* Since we know we are copying at least 16 bytes, the fastest way
to deal with the tail is to determine the location of the
@@ -250,7 +244,7 @@ def_fn STRCPY p2align=6
#endif
ret
-.Lpage_cross:
+L(page_cross):
bic src, srcin, #15
/* Start by loading two words at [srcin & ~15], then forcing the
bytes that precede srcin to 0xff. This means they never look
@@ -276,7 +270,7 @@ def_fn STRCPY p2align=6
bic has_nul1, tmp1, tmp2
bics has_nul2, tmp3, tmp4
ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
- b.eq .Lpage_cross_ok
+ b.eq L(page_cross_ok)
/* We now need to make data1 and data2 look like they've been
loaded directly from srcin. Do a rotate on the 128-bit value. */
lsl tmp1, to_align, #3 /* Bytes->bits. */
@@ -307,8 +301,8 @@ def_fn STRCPY p2align=6
orr tmp4, data2, #REP8_7f
#endif
bic has_nul1, tmp1, tmp2
- cbnz has_nul1, .Lfp_le8
+ cbnz has_nul1, L(fp_le8)
bic has_nul2, tmp3, tmp4
- b .Lfp_gt8
+ b L(fp_gt8)
- .size STRCPY, . - STRCPY
+END (STRCPY)
diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S
index 26388d7..2293f73 100644
--- a/string/aarch64/strlen.S
+++ b/string/aarch64/strlen.S
@@ -10,6 +10,8 @@
* ARMv8-a, AArch64, unaligned accesses, min page size 4k.
*/
+#include "../asmdefs.h"
+
/* To test the page crossing code path more thoroughly, compile with
-DTEST_PAGE_CROSS - this will force all calls through the slower
entry path. This option is not intended for production use. */
@@ -30,16 +32,6 @@
#define tmp4 x7
#define zeroones x8
-#define L(l) .L ## l
-
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
-
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. A faster check
@@ -81,7 +73,7 @@
whether the first fetch, which may be misaligned, crosses a page
boundary. */
-def_fn __strlen_aarch64 p2align=6
+ENTRY (__strlen_aarch64)
and tmp1, srcin, MIN_PAGE_SIZE - 1
mov zeroones, REP8_01
cmp tmp1, MIN_PAGE_SIZE - 16
@@ -122,7 +114,7 @@ L(main_loop_entry):
sub src, src, 16
L(main_loop):
ldp data1, data2, [src, 32]!
-.Lpage_cross_entry:
+L(page_cross_entry):
sub tmp1, data1, zeroones
sub tmp3, data2, zeroones
orr tmp2, tmp1, tmp3
@@ -211,4 +203,4 @@ L(page_cross):
csel data2, data2, tmp2, eq
b L(page_cross_entry)
- .size __strlen_aarch64, . - __strlen_aarch64
+END (__strlen_aarch64)
diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S
index ced72b9..fbd08ee 100644
--- a/string/aarch64/strncmp.S
+++ b/string/aarch64/strncmp.S
@@ -10,13 +10,7 @@
* ARMv8-a, AArch64
*/
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
+#include "../asmdefs.h"
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
@@ -51,14 +45,14 @@
.rep 7
nop /* Pad so that the loop below fits a cache line. */
.endr
-def_fn __strncmp_aarch64
- cbz limit, .Lret0
+ENTRY_ALIGN (__strncmp_aarch64, 0)
+ cbz limit, L(ret0)
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
and count, src1, #7
- b.ne .Lmisaligned8
- cbnz count, .Lmutual_align
+ b.ne L(misaligned8)
+ cbnz count, L(mutual_align)
/* Calculate the number of full and partial words -1. */
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
@@ -67,10 +61,10 @@ def_fn __strncmp_aarch64
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
/* Start of performance-critical section -- one 64B cache line. */
-.Lloop_aligned:
+L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
-.Lstart_realigned:
+L(start_realigned):
subs limit_wd, limit_wd, #1
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
@@ -78,15 +72,15 @@ def_fn __strncmp_aarch64
csinv endloop, diff, xzr, pl /* Last Dword or differences. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp endloop, #0, #0, eq
- b.eq .Lloop_aligned
+ b.eq L(loop_aligned)
/* End of performance-critical section -- one 64B cache line. */
/* Not reached the limit, must have found the end or a diff. */
- tbz limit_wd, #63, .Lnot_limit
+ tbz limit_wd, #63, L(not_limit)
/* Limit % 8 == 0 => all bytes significant. */
ands limit, limit, #7
- b.eq .Lnot_limit
+ b.eq L(not_limit)
lsl limit, limit, #3 /* Bits -> bytes. */
mov mask, #~0
@@ -101,7 +95,7 @@ def_fn __strncmp_aarch64
/* Make sure that the NUL byte is marked in the syndrome. */
orr has_nul, has_nul, mask
-.Lnot_limit:
+L(not_limit):
orr syndrome, diff, has_nul
#ifndef __AARCH64EB__
@@ -154,7 +148,7 @@ def_fn __strncmp_aarch64
ret
#endif
-.Lmutual_align:
+L(mutual_align):
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
the bytes that precede the start point.
@@ -182,56 +176,56 @@ def_fn __strncmp_aarch64
orr data1, data1, tmp2
orr data2, data2, tmp2
add limit_wd, limit_wd, tmp3, lsr #3
- b .Lstart_realigned
+ b L(start_realigned)
.p2align 6
/* Don't bother with dwords for up to 16 bytes. */
-.Lmisaligned8:
+L(misaligned8):
cmp limit, #16
- b.hs .Ltry_misaligned_words
+ b.hs L(try_misaligned_words)
-.Lbyte_loop:
+L(byte_loop):
/* Perhaps we can do better than this. */
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
subs limit, limit, #1
ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq .Lbyte_loop
-.Ldone:
+ b.eq L(byte_loop)
+L(done):
sub result, data1, data2
ret
/* Align the SRC1 to a dword by doing a bytewise compare and then do
the dword loop. */
-.Ltry_misaligned_words:
+L(try_misaligned_words):
lsr limit_wd, limit, #3
- cbz count, .Ldo_misaligned
+ cbz count, L(do_misaligned)
neg count, count
and count, count, #7
sub limit, limit, count
lsr limit_wd, limit, #3
-.Lpage_end_loop:
+L(page_end_loop):
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
cmp data1w, #1
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.ne .Ldone
+ b.ne L(done)
subs count, count, #1
- b.hi .Lpage_end_loop
+ b.hi L(page_end_loop)
-.Ldo_misaligned:
+L(do_misaligned):
/* Prepare ourselves for the next page crossing. Unlike the aligned
loop, we fetch 1 less dword because we risk crossing bounds on
SRC2. */
mov count, #8
subs limit_wd, limit_wd, #1
- b.lo .Ldone_loop
-.Lloop_misaligned:
+ b.lo L(done_loop)
+L(loop_misaligned):
and tmp2, src2, #0xff8
eor tmp2, tmp2, #0xff8
- cbz tmp2, .Lpage_end_loop
+ cbz tmp2, L(page_end_loop)
ldr data1, [src1], #8
ldr data2, [src2], #8
@@ -240,14 +234,14 @@ def_fn __strncmp_aarch64
eor diff, data1, data2 /* Non-zero if differences found. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp diff, #0, #0, eq
- b.ne .Lnot_limit
+ b.ne L(not_limit)
subs limit_wd, limit_wd, #1
- b.pl .Lloop_misaligned
+ b.pl L(loop_misaligned)
-.Ldone_loop:
+L(done_loop):
/* We found a difference or a NULL before the limit was reached. */
and limit, limit, #7
- cbz limit, .Lnot_limit
+ cbz limit, L(not_limit)
/* Read the last word. */
sub src1, src1, 8
sub src2, src2, 8
@@ -258,9 +252,10 @@ def_fn __strncmp_aarch64
eor diff, data1, data2 /* Non-zero if differences found. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp diff, #0, #0, eq
- b.ne .Lnot_limit
+ b.ne L(not_limit)
-.Lret0:
+L(ret0):
mov result, #0
ret
- .size __strncmp_aarch64, . - __strncmp_aarch64
+
+END ( __strncmp_aarch64)
diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S
index b02c846..df66b60 100644
--- a/string/aarch64/strnlen.S
+++ b/string/aarch64/strnlen.S
@@ -10,6 +10,8 @@
* ARMv8-a, AArch64
*/
+#include "../asmdefs.h"
+
/* Arguments and results. */
#define srcin x0
#define len x0
@@ -30,36 +32,28 @@
#define pos x13
#define limit_wd x14
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
-
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
.text
.p2align 6
-.Lstart:
+L(start):
/* Pre-pad to ensure critical loop begins an icache line. */
.rep 7
nop
.endr
/* Put this code here to avoid wasting more space with pre-padding. */
-.Lhit_limit:
+L(hit_limit):
mov len, limit
ret
-def_fn __strnlen_aarch64
- cbz limit, .Lhit_limit
+ENTRY_ALIGN (__strnlen_aarch64, 0)
+ cbz limit, L(hit_limit)
mov zeroones, #REP8_01
bic src, srcin, #15
ands tmp1, srcin, #15
- b.ne .Lmisaligned
+ b.ne L(misaligned)
/* Calculate the number of full and partial words -1. */
sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */
lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */
@@ -73,9 +67,9 @@ def_fn __strnlen_aarch64
cycle, as we get much better parallelism out of the operations. */
/* Start of critical section -- keep to one 64Byte cache line. */
-.Lloop:
+L(loop):
ldp data1, data2, [src], #16
-.Lrealigned:
+L(realigned):
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
sub tmp3, data2, zeroones
@@ -85,24 +79,24 @@ def_fn __strnlen_aarch64
subs limit_wd, limit_wd, #1
orr tmp1, has_nul1, has_nul2
ccmp tmp1, #0, #0, pl /* NZCV = 0000 */
- b.eq .Lloop
+ b.eq L(loop)
/* End of critical section -- keep to one 64Byte cache line. */
orr tmp1, has_nul1, has_nul2
- cbz tmp1, .Lhit_limit /* No null in final Qword. */
+ cbz tmp1, L(hit_limit) /* No null in final Qword. */
/* We know there's a null in the final Qword. The easiest thing
to do now is work out the length of the string and return
MIN (len, limit). */
sub len, src, srcin
- cbz has_nul1, .Lnul_in_data2
+ cbz has_nul1, L(nul_in_data2)
#ifdef __AARCH64EB__
mov data2, data1
#endif
sub len, len, #8
mov has_nul2, has_nul1
-.Lnul_in_data2:
+L(nul_in_data2):
#ifdef __AARCH64EB__
/* For big-endian, carry propagation (if the final byte in the
string is 0x01) means we cannot use has_nul directly. The
@@ -121,7 +115,7 @@ def_fn __strnlen_aarch64
csel len, len, limit, ls /* Return the lower value. */
ret
-.Lmisaligned:
+L(misaligned):
/* Deal with a partial first word.
We're doing two things in parallel here;
1) Calculate the number of words (but avoiding overflow if
@@ -156,5 +150,6 @@ def_fn __strnlen_aarch64
csinv data1, data1, xzr, le
csel data2, data2, data2a, le
- b .Lrealigned
- .size __strnlen_aarch64, . - .Lstart /* Include pre-padding in size. */
+ b L(realigned)
+
+END (__strnlen_aarch64)
diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S
new file mode 100644
index 0000000..1b4caac
--- /dev/null
+++ b/string/aarch64/strrchr.S
@@ -0,0 +1,147 @@
+/*
+ * strrchr - find last position of a character in a string.
+ *
+ * Copyright (c) 2014-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+#include "../asmdefs.h"
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+
+#define result x0
+
+#define src x2
+#define tmp1 x3
+#define wtmp2 w4
+#define tmp3 x5
+#define src_match x6
+#define src_offset x7
+#define const_m1 x8
+#define tmp4 x9
+#define nul_match x10
+#define chr_match x11
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_nul1 v3
+#define vhas_nul2 v4
+#define vhas_chr1 v5
+#define vhas_chr2 v6
+#define vrepmask_0 v7
+#define vrepmask_c v16
+#define vend1 v17
+#define vend2 v18
+
+/* Core algorithm.
+
+ For each 32-byte hunk we calculate a 64-bit syndrome value, with
+ two bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bit 0 is set iff
+ the relevant byte matched the requested character; bit 1 is set
+ iff the relevant byte matched the NUL end of string (we trigger
+ off bit0 for the special case of looking for NUL). Since the bits
+ in the syndrome reflect exactly the order in which things occur
+ in the original string a count_trailing_zeros() operation will
+ identify exactly which byte is causing the termination, and why. */
+
+ENTRY (__strrchr_aarch64)
+ /* Magic constant 0x40100401 to allow us to identify which lane
+ matches the requested byte. Magic constant 0x80200802 used
+ similarly for NUL termination. */
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
+ dup vrepmask_c.4s, wtmp2
+ mov src_offset, #0
+ ands tmp1, srcin, #31
+ add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
+ b.eq L(aligned)
+
+ /* Input string is not 32-byte aligned. Rather than forcing
+ the padding bytes to a safe value, we calculate the syndrome
+ for all the bytes, but then mask off those bits of the
+ syndrome that are related to the padding. */
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ neg tmp1, tmp1
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+ and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
+ addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b // 128->64
+ addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64
+ mov nul_match, vhas_nul1.d[0]
+ lsl tmp1, tmp1, #1
+ mov const_m1, #~0
+ mov chr_match, vhas_chr1.d[0]
+ lsr tmp3, const_m1, tmp1
+
+ bic nul_match, nul_match, tmp3 // Mask padding bits.
+ bic chr_match, chr_match, tmp3 // Mask padding bits.
+ cbnz nul_match, L(tail)
+
+L(loop):
+ cmp chr_match, #0
+ csel src_match, src, src_match, ne
+ csel src_offset, chr_match, src_offset, ne
+L(aligned):
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ addp vend1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
+ addp vend1.16b, vend1.16b, vend1.16b // 128->64
+ addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64
+ mov nul_match, vend1.d[0]
+ mov chr_match, vhas_chr1.d[0]
+ cbz nul_match, L(loop)
+
+ and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+ and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
+ addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
+ mov nul_match, vhas_nul1.d[0]
+
+L(tail):
+ /* Work out exactly where the string ends. */
+ sub tmp4, nul_match, #1
+ eor tmp4, tmp4, nul_match
+ ands chr_match, chr_match, tmp4
+ /* And pick the values corresponding to the last match. */
+ csel src_match, src, src_match, ne
+ csel src_offset, chr_match, src_offset, ne
+
+ /* Count down from the top of the syndrome to find the last match. */
+ clz tmp3, src_offset
+ /* Src_match points beyond the word containing the match, so we can
+ simply subtract half the bit-offset into the syndrome. Because
+ we are counting down, we need to go back one more character. */
+ add tmp3, tmp3, #2
+ sub result, src_match, tmp3, lsr #1
+ /* But if the syndrome shows no match was found, then return NULL. */
+ cmp src_offset, #0
+ csel result, result, xzr, ne
+
+ ret
+
+END (__strrchr_aarch64)
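
The header comment above describes the scan in terms of two-bit-per-byte syndromes; the same idea in portable C, using one bit per byte instead of the NEON reduction, looks roughly like this (an illustrative model written for this note, not code the assembly is derived from):

    #include <stdint.h>

    #define CHUNK 32

    /* Build per-chunk masks of character matches and NUL matches, remember
       the last chunk whose matches precede the terminator, then take the
       highest matching bit.  Like the assembly, it examines whole chunks,
       so a real C version would need the buffer readable to the chunk end. */
    static char *
    strrchr_model (const char *s, int c)
    {
      const char *best_chunk = NULL;
      uint64_t best_mask = 0;

      for (const char *p = s;; p += CHUNK)
        {
          uint64_t chr_mask = 0, nul_mask = 0;
          for (int i = 0; i < CHUNK; i++)
            {
              if (p[i] == (char) c)
                chr_mask |= 1ull << i;
              if (p[i] == '\0')
                nul_mask |= 1ull << i;
            }
          if (nul_mask == 0)
            {
              if (chr_mask)        /* No NUL yet: latest matches so far win. */
                {
                  best_chunk = p;
                  best_mask = chr_mask;
                }
              continue;
            }
          /* Final chunk: keep matches at or below the first NUL -- the
             (nul - 1) ^ nul trick from L(tail); "at or below" also covers
             strrchr (s, '\0'), which must return the terminator.  */
          chr_mask &= (nul_mask - 1) ^ nul_mask;
          if (chr_mask)
            {
              best_chunk = p;
              best_mask = chr_mask;
            }
          if (best_chunk == NULL)
            return NULL;
          /* Highest set bit == last occurrence in the winning chunk.  */
          return (char *) best_chunk + (63 - __builtin_clzll (best_mask));
        }
    }
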
diff --git a/string/arm/memcpy.S b/string/arm/memcpy.S
index 3346e4f..aab78a2 100644
--- a/string/arm/memcpy.S
+++ b/string/arm/memcpy.S
@@ -17,6 +17,8 @@
*/
+#include "../asmdefs.h"
+
.syntax unified
/* This implementation requires ARM state. */
.arm
@@ -118,23 +120,15 @@
.endm
#endif
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
-
-def_fn __memcpy_arm p2align=6
+ENTRY (__memcpy_arm)
mov dst, dstin /* Preserve dstin, we need to return it. */
cmp count, #64
- bge .Lcpy_not_short
+ bge L(cpy_not_short)
/* Deal with small copies quickly by dropping straight into the
exit block. */
-.Ltail63unaligned:
+L(tail63unaligned):
#ifdef USE_NEON
and tmp1, count, #0x38
rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
@@ -213,13 +207,13 @@ def_fn __memcpy_arm p2align=6
strbne src, [dst]
bx lr
-.Lcpy_not_short:
+L(cpy_not_short):
/* At least 64 bytes to copy, but don't know the alignment yet. */
str tmp2, [sp, #-FRAME_SIZE]!
and tmp2, src, #7
and tmp1, dst, #7
cmp tmp1, tmp2
- bne .Lcpy_notaligned
+ bne L(cpy_notaligned)
#ifdef USE_VFP
/* Magic dust alert! Force VFP on Cortex-A9. Experiments show
@@ -245,12 +239,12 @@ def_fn __memcpy_arm p2align=6
1:
subs tmp2, count, #64 /* Use tmp2 for count. */
- blt .Ltail63aligned
+ blt L(tail63aligned)
cmp tmp2, #512
- bge .Lcpy_body_long
+ bge L(cpy_body_long)
-.Lcpy_body_medium: /* Count in tmp2. */
+L(cpy_body_medium): /* Count in tmp2. */
#ifdef USE_VFP
1:
vldr d0, [src, #0]
@@ -274,9 +268,9 @@ def_fn __memcpy_arm p2align=6
add dst, dst, #64
bge 1b
tst tmp2, #0x3f
- beq .Ldone
+ beq L(done)
-.Ltail63aligned: /* Count in tmp2. */
+L(tail63aligned): /* Count in tmp2. */
and tmp1, tmp2, #0x38
add dst, dst, tmp1
add src, src, tmp1
@@ -327,7 +321,7 @@ def_fn __memcpy_arm p2align=6
add src, src, #8
add dst, dst, #8
-.Ltail63aligned: /* Count in tmp2. */
+L(tail63aligned): /* Count in tmp2. */
/* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
we know that the src and dest are 64-bit aligned so we can use
LDRD/STRD to improve efficiency. */
@@ -364,11 +358,11 @@ def_fn __memcpy_arm p2align=6
strhcs tmp1, [dst], #2
strbne tmp2, [dst]
-.Ldone:
+L(done):
ldr tmp2, [sp], #FRAME_SIZE
bx lr
-.Lcpy_body_long: /* Count in tmp2. */
+L(cpy_body_long): /* Count in tmp2. */
/* Long copy. We know that there's at least (prefetch_lines * 64)
bytes to go. */
@@ -425,7 +419,7 @@ def_fn __memcpy_arm p2align=6
vstr d2, [dst, #64 + 56]
add dst, dst, #128
add tmp2, tmp2, #prefetch_lines * 64
- b .Lcpy_body_medium
+ b L(cpy_body_medium)
#else
/* Long copy. Use an SMS style loop to maximize the I/O
bandwidth of the core. We don't have enough spare registers
@@ -479,12 +473,12 @@ def_fn __memcpy_arm p2align=6
ldrd D_l, D_h, [sp, #24]
add dst, dst, #72
tst tmp2, #0x3f
- bne .Ltail63aligned
+ bne L(tail63aligned)
ldr tmp2, [sp], #FRAME_SIZE
bx lr
#endif
-.Lcpy_notaligned:
+L(cpy_notaligned):
pld [src]
pld [src, #64]
/* There's at least 64 bytes to copy, but there is no mutual
@@ -506,7 +500,7 @@ def_fn __memcpy_arm p2align=6
pld [src, #(3 * 64)]
subs count, count, #64
ldrmi tmp2, [sp], #FRAME_SIZE
- bmi .Ltail63unaligned
+ bmi L(tail63unaligned)
pld [src, #(4 * 64)]
#ifdef USE_NEON
@@ -587,7 +581,7 @@ def_fn __memcpy_arm p2align=6
ands count, tmp2, #0x3f
#endif
ldr tmp2, [sp], #FRAME_SIZE
- bne .Ltail63unaligned
+ bne L(tail63unaligned)
bx lr
- .size __memcpy_arm, . - __memcpy_arm
+END (__memcpy_arm)
diff --git a/string/arm/strcmp-armv6m.S b/string/arm/strcmp-armv6m.S
index 5ea06c9..d615231 100644
--- a/string/arm/strcmp-armv6m.S
+++ b/string/arm/strcmp-armv6m.S
@@ -26,12 +26,7 @@
DoSub \n, \label
.endm
- .text
- .p2align 0
- .global __strcmp_armv6m
- .type __strcmp_armv6m, %function
-__strcmp_armv6m:
- .cfi_startproc
+ENTRY_ALIGN (__strcmp_armv6m, 4)
mov r2, r0
push {r4, r5, r6, lr}
orrs r2, r1
@@ -114,5 +109,5 @@ __strcmp_armv6m:
7:
subs r0, r2, r3
pop {r4, r5, r6, pc}
- .cfi_endproc
- .size __strcmp_armv6m, . - __strcmp_armv6m
+
+END (__strcmp_armv6m)
diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S
index fb9cae3..295db8b 100644
--- a/string/arm/strcmp.S
+++ b/string/arm/strcmp.S
@@ -10,6 +10,8 @@
is sufficiently aligned. Use saturating arithmetic to optimize
the compares. */
+#include "../asmdefs.h"
+
/* Build Options:
STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
byte in the string. If comparing completely random strings
@@ -48,14 +50,6 @@
#define LSB 0x000000ff
#endif /* not __ARM_BIG_ENDIAN */
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
-
/* Parameters and result. */
#define src1 r0
#define src2 r1
@@ -131,23 +125,22 @@
.text
.p2align 5
-.Lstrcmp_start_addr:
+L(strcmp_start_addr):
#if STRCMP_NO_PRECHECK == 0
-.Lfastpath_exit:
+L(fastpath_exit):
sub r0, r2, r3
bx lr
nop
#endif
-def_fn __strcmp_arm
+ENTRY_ALIGN (__strcmp_arm, 0)
#if STRCMP_NO_PRECHECK == 0
ldrb r2, [src1]
ldrb r3, [src2]
cmp r2, #1
it cs
cmpcs r2, r3
- bne .Lfastpath_exit
+ bne L(fastpath_exit)
#endif
- .cfi_startproc
strd r4, r5, [sp, #-16]!
.cfi_def_cfa_offset 16
.cfi_offset 4, -16
@@ -158,12 +151,12 @@ def_fn __strcmp_arm
.cfi_offset 7, -4
mvn const_m1, #0
lsl r2, tmp1, #29
- cbz r2, .Lloop_aligned8
+ cbz r2, L(loop_aligned8)
-.Lnot_aligned:
+L(not_aligned):
eor tmp1, src1, src2
tst tmp1, #7
- bne .Lmisaligned8
+ bne L(misaligned8)
/* Deal with mutual misalignment by aligning downwards and then
masking off the unwanted loaded data to prevent a difference. */
@@ -180,29 +173,29 @@ def_fn __strcmp_arm
S2HI tmp1, const_m1, tmp2
orn data1a, data1a, tmp1
orn data2a, data2a, tmp1
- beq .Lstart_realigned8
+ beq L(start_realigned8)
orn data1b, data1b, tmp1
mov data1a, const_m1
orn data2b, data2b, tmp1
mov data2a, const_m1
- b .Lstart_realigned8
+ b L(start_realigned8)
/* Unwind the inner loop by a factor of 2, giving 16 bytes per
pass. */
.p2align 5,,12 /* Don't start in the tail bytes of a cache line. */
.p2align 2 /* Always word aligned. */
-.Lloop_aligned8:
+L(loop_aligned8):
ldrd data1a, data1b, [src1], #16
ldrd data2a, data2b, [src2], #16
-.Lstart_realigned8:
+L(start_realigned8):
uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */
eor syndrome_a, data1a, data2a
sel syndrome_a, syndrome_a, const_m1
- cbnz syndrome_a, .Ldiff_in_a
+ cbnz syndrome_a, L(diff_in_a)
uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */
eor syndrome_b, data1b, data2b
sel syndrome_b, syndrome_b, const_m1
- cbnz syndrome_b, .Ldiff_in_b
+ cbnz syndrome_b, L(diff_in_b)
ldrd data1a, data1b, [src1, #-8]
ldrd data2a, data2b, [src2, #-8]
@@ -214,47 +207,47 @@ def_fn __strcmp_arm
sel syndrome_b, syndrome_b, const_m1
/* Can't use CBZ for backwards branch. */
orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
- beq .Lloop_aligned8
+ beq L(loop_aligned8)
-.Ldiff_found:
- cbnz syndrome_a, .Ldiff_in_a
+L(diff_found):
+ cbnz syndrome_a, L(diff_in_a)
-.Ldiff_in_b:
+L(diff_in_b):
strcmp_epilogue_aligned syndrome_b, data1b, data2b 1
-.Ldiff_in_a:
+L(diff_in_a):
.cfi_restore_state
strcmp_epilogue_aligned syndrome_a, data1a, data2a 1
.cfi_restore_state
-.Lmisaligned8:
+L(misaligned8):
tst tmp1, #3
- bne .Lmisaligned4
+ bne L(misaligned4)
ands tmp1, src1, #3
- bne .Lmutual_align4
+ bne L(mutual_align4)
/* Unrolled by a factor of 2, to reduce the number of post-increment
operations. */
-.Lloop_aligned4:
+L(loop_aligned4):
ldr data1, [src1], #8
ldr data2, [src2], #8
-.Lstart_realigned4:
+L(start_realigned4):
uadd8 syndrome, data1, const_m1 /* Only need GE bits. */
eor syndrome, data1, data2
sel syndrome, syndrome, const_m1
- cbnz syndrome, .Laligned4_done
+ cbnz syndrome, L(aligned4_done)
ldr data1, [src1, #-4]
ldr data2, [src2, #-4]
uadd8 syndrome, data1, const_m1
eor syndrome, data1, data2
sel syndrome, syndrome, const_m1
cmp syndrome, #0
- beq .Lloop_aligned4
+ beq L(loop_aligned4)
-.Laligned4_done:
+L(aligned4_done):
strcmp_epilogue_aligned syndrome, data1, data2, 0
-.Lmutual_align4:
+L(mutual_align4):
.cfi_restore_state
/* Deal with mutual misalignment by aligning downwards and then
masking off the unwanted loaded data to prevent a difference. */
@@ -269,57 +262,57 @@ def_fn __strcmp_arm
S2HI tmp1, const_m1, tmp1
orn data1, data1, tmp1
orn data2, data2, tmp1
- b .Lstart_realigned4
+ b L(start_realigned4)
-.Lmisaligned4:
+L(misaligned4):
ands tmp1, src1, #3
- beq .Lsrc1_aligned
+ beq L(src1_aligned)
sub src2, src2, tmp1
bic src1, src1, #3
lsls tmp1, tmp1, #31
ldr data1, [src1], #4
- beq .Laligned_m2
- bcs .Laligned_m1
+ beq L(aligned_m2)
+ bcs L(aligned_m1)
#if STRCMP_NO_PRECHECK == 1
ldrb data2, [src2, #1]
uxtb tmp1, data1, ror #BYTE1_OFFSET
subs tmp1, tmp1, data2
- bne .Lmisaligned_exit
- cbz data2, .Lmisaligned_exit
+ bne L(misaligned_exit)
+ cbz data2, L(misaligned_exit)
-.Laligned_m2:
+L(aligned_m2):
ldrb data2, [src2, #2]
uxtb tmp1, data1, ror #BYTE2_OFFSET
subs tmp1, tmp1, data2
- bne .Lmisaligned_exit
- cbz data2, .Lmisaligned_exit
+ bne L(misaligned_exit)
+ cbz data2, L(misaligned_exit)
-.Laligned_m1:
+L(aligned_m1):
ldrb data2, [src2, #3]
uxtb tmp1, data1, ror #BYTE3_OFFSET
subs tmp1, tmp1, data2
- bne .Lmisaligned_exit
+ bne L(misaligned_exit)
add src2, src2, #4
- cbnz data2, .Lsrc1_aligned
+ cbnz data2, L(src1_aligned)
#else /* STRCMP_NO_PRECHECK */
/* If we've done the pre-check, then we don't need to check the
first byte again here. */
ldrb data2, [src2, #2]
uxtb tmp1, data1, ror #BYTE2_OFFSET
subs tmp1, tmp1, data2
- bne .Lmisaligned_exit
- cbz data2, .Lmisaligned_exit
+ bne L(misaligned_exit)
+ cbz data2, L(misaligned_exit)
-.Laligned_m2:
+L(aligned_m2):
ldrb data2, [src2, #3]
uxtb tmp1, data1, ror #BYTE3_OFFSET
subs tmp1, tmp1, data2
- bne .Lmisaligned_exit
- cbnz data2, .Laligned_m1
+ bne L(misaligned_exit)
+ cbnz data2, L(aligned_m1)
#endif
-.Lmisaligned_exit:
+L(misaligned_exit):
.cfi_remember_state
mov result, tmp1
ldr r4, [sp], #16
@@ -327,10 +320,10 @@ def_fn __strcmp_arm
bx lr
#if STRCMP_NO_PRECHECK == 0
-.Laligned_m1:
+L(aligned_m1):
add src2, src2, #4
#endif
-.Lsrc1_aligned:
+L(src1_aligned):
.cfi_restore_state
/* src1 is word aligned, but src2 has no common alignment
with it. */
@@ -339,11 +332,11 @@ def_fn __strcmp_arm
bic src2, src2, #3
ldr data2, [src2], #4
- bhi .Loverlap1 /* C=1, Z=0 => src2[1:0] = 0b11. */
- bcs .Loverlap2 /* C=1, Z=1 => src2[1:0] = 0b10. */
+ bhi L(overlap1) /* C=1, Z=0 => src2[1:0] = 0b11. */
+ bcs L(overlap2) /* C=1, Z=1 => src2[1:0] = 0b10. */
/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */
-.Loverlap3:
+L(overlap3):
bic tmp1, data1, #MSB
uadd8 syndrome, data1, const_m1
eors syndrome, tmp1, data2, S2LO #8
@@ -355,14 +348,14 @@ def_fn __strcmp_arm
cmp tmp1, data2, S2HI #24
bne 6f
ldr data1, [src1], #4
- b .Loverlap3
+ b L(overlap3)
4:
S2LO data2, data2, #8
- b .Lstrcmp_tail
+ b L(strcmp_tail)
5:
bics syndrome, syndrome, #MSB
- bne .Lstrcmp_done_equal
+ bne L(strcmp_done_equal)
/* We can only get here if the MSB of data1 contains 0, so
fast-path the exit. */
@@ -381,10 +374,10 @@ def_fn __strcmp_arm
.cfi_restore_state
S2LO data1, data1, #24
and data2, data2, #LSB
- b .Lstrcmp_tail
+ b L(strcmp_tail)
.p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
-.Loverlap2:
+L(overlap2):
and tmp1, data1, const_m1, S2LO #16
uadd8 syndrome, data1, const_m1
eors syndrome, tmp1, data2, S2LO #16
@@ -396,28 +389,28 @@ def_fn __strcmp_arm
cmp tmp1, data2, S2HI #16
bne 6f
ldr data1, [src1], #4
- b .Loverlap2
+ b L(overlap2)
4:
S2LO data2, data2, #16
- b .Lstrcmp_tail
+ b L(strcmp_tail)
5:
ands syndrome, syndrome, const_m1, S2LO #16
- bne .Lstrcmp_done_equal
+ bne L(strcmp_done_equal)
ldrh data2, [src2]
S2LO data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
lsl data2, data2, #16
#endif
- b .Lstrcmp_tail
+ b L(strcmp_tail)
6:
S2LO data1, data1, #16
and data2, data2, const_m1, S2LO #16
- b .Lstrcmp_tail
+ b L(strcmp_tail)
.p2align 5,,12 /* Ensure at least 3 instructions in cache line. */
-.Loverlap1:
+L(overlap1):
and tmp1, data1, #LSB
uadd8 syndrome, data1, const_m1
eors syndrome, tmp1, data2, S2LO #24
@@ -429,20 +422,20 @@ def_fn __strcmp_arm
cmp tmp1, data2, S2HI #8
bne 6f
ldr data1, [src1], #4
- b .Loverlap1
+ b L(overlap1)
4:
S2LO data2, data2, #24
- b .Lstrcmp_tail
+ b L(strcmp_tail)
5:
tst syndrome, #LSB
- bne .Lstrcmp_done_equal
+ bne L(strcmp_done_equal)
ldr data2, [src2]
6:
S2LO data1, data1, #8
bic data2, data2, #MSB
- b .Lstrcmp_tail
+ b L(strcmp_tail)
-.Lstrcmp_done_equal:
+L(strcmp_done_equal):
mov result, #0
.cfi_remember_state
ldrd r4, r5, [sp], #16
@@ -453,7 +446,7 @@ def_fn __strcmp_arm
.cfi_restore 7
bx lr
-.Lstrcmp_tail:
+L(strcmp_tail):
.cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
rev data1, data1
@@ -475,5 +468,5 @@ def_fn __strcmp_arm
.cfi_restore 7
sub result, result, data2, lsr #24
bx lr
- .cfi_endproc
- .size __strcmp, . - .Lstrcmp_start_addr
+
+END (__strcmp_arm)
diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S
index 279ec87..76e6930 100644
--- a/string/arm/strlen-armv6t2.S
+++ b/string/arm/strlen-armv6t2.S
@@ -11,13 +11,7 @@
*/
- .macro def_fn f p2align=0
- .text
- .p2align \p2align
- .global \f
- .type \f, %function
-\f:
- .endm
+#include "../asmdefs.h"
#ifdef __ARMEB__
#define S2LO lsl
@@ -44,27 +38,27 @@
#define tmp1 r4 /* Overlaps const_0 */
#define tmp2 r5
-def_fn __strlen_armv6t2 p2align=6
+ENTRY (__strlen_armv6t2)
pld [srcin, #0]
strd r4, r5, [sp, #-8]!
bic src, srcin, #7
mvn const_m1, #0
ands tmp1, srcin, #7 /* (8 - bytes) to alignment. */
pld [src, #32]
- bne.w .Lmisaligned8
+ bne.w L(misaligned8)
mov const_0, #0
mov result, #-8
-.Lloop_aligned:
+L(loop_aligned):
/* Bytes 0-7. */
ldrd data1a, data1b, [src]
pld [src, #64]
add result, result, #8
-.Lstart_realigned:
+L(start_realigned):
uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */
sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
uadd8 data1b, data1b, const_m1
sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
- cbnz data1b, .Lnull_found
+ cbnz data1b, L(null_found)
/* Bytes 8-15. */
ldrd data1a, data1b, [src, #8]
@@ -73,7 +67,7 @@ def_fn __strlen_armv6t2 p2align=6
sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
uadd8 data1b, data1b, const_m1
sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
- cbnz data1b, .Lnull_found
+ cbnz data1b, L(null_found)
/* Bytes 16-23. */
ldrd data1a, data1b, [src, #16]
@@ -82,7 +76,7 @@ def_fn __strlen_armv6t2 p2align=6
sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */
uadd8 data1b, data1b, const_m1
sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
- cbnz data1b, .Lnull_found
+ cbnz data1b, L(null_found)
/* Bytes 24-31. */
ldrd data1a, data1b, [src, #24]
@@ -93,9 +87,9 @@ def_fn __strlen_armv6t2 p2align=6
uadd8 data1b, data1b, const_m1
sel data1b, data1a, const_m1 /* Only used if d1a == 0. */
cmp data1b, #0
- beq .Lloop_aligned
+ beq L(loop_aligned)
-.Lnull_found:
+L(null_found):
cmp data1a, #0
itt eq
addeq result, result, #4
@@ -108,7 +102,7 @@ def_fn __strlen_armv6t2 p2align=6
add result, result, data1a, lsr #3 /* Bits -> Bytes. */
bx lr
-.Lmisaligned8:
+L(misaligned8):
ldrd data1a, data1b, [src]
and tmp2, tmp1, #3
rsb result, tmp1, #0
@@ -121,5 +115,6 @@ def_fn __strlen_armv6t2 p2align=6
ornne data1b, data1b, tmp2
movne data1a, const_m1
mov const_0, #0
- b .Lstart_realigned
- .size __strlen_armv6t2, . - __strlen_armv6t2
+ b L(start_realigned)
+
+END (__strlen_armv6t2)
diff --git a/string/include/stringlib.h b/string/include/stringlib.h
index 3f60220..b3b6181 100644
--- a/string/include/stringlib.h
+++ b/string/include/stringlib.h
@@ -13,7 +13,6 @@
#endif
#if __aarch64__
-void *__memcpy_bytewise (void *__restrict, const void *__restrict, size_t);
void *__memcpy_aarch64 (void *__restrict, const void *__restrict, size_t);
void *__memmove_aarch64 (void *, const void *, size_t);
void *__memset_aarch64 (void *, int, size_t);
@@ -22,6 +21,7 @@ int __memcmp_aarch64 (const void *, const void *, size_t);
char *__strcpy_aarch64 (char *__restrict, const char *__restrict);
int __strcmp_aarch64 (const char *, const char *);
char *__strchr_aarch64 (const char *, int);
+char *__strrchr_aarch64 (const char *, int);
char *__strchrnul_aarch64 (const char *, int );
size_t __strlen_aarch64 (const char *);
size_t __strnlen_aarch64 (const char *, size_t);
diff --git a/string/memcpy_bytewise.S b/string/memcpy_bytewise.S
deleted file mode 100644
index 7ee3474..0000000
--- a/string/memcpy_bytewise.S
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Trivial AArch64 memcpy.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#if __aarch64__
-#include "asmdefs.h"
-
-ENTRY (__memcpy_bytewise)
- cbz x2, 2f
- mov x3, 0
-1:
- ldrb w4, [x1, x3]
- strb w4, [x0, x3]
- add x3, x3, 1
- cmp x3, x2
- bne 1b
-2:
- ret
-END (__memcpy_bytewise)
-#endif
diff --git a/string/strrchr.S b/string/strrchr.S
index 18b1cf9..119b1d5 100644
--- a/string/strrchr.S
+++ b/string/strrchr.S
@@ -6,6 +6,7 @@
*/
#if __aarch64__
+#include "aarch64/strrchr.S"
# if __ARM_FEATURE_SVE
#include "aarch64/strrchr-sve.S"
# endif
diff --git a/string/test/memcpy.c b/string/test/memcpy.c
index e31f359..8572452 100644
--- a/string/test/memcpy.c
+++ b/string/test/memcpy.c
@@ -19,7 +19,6 @@ static const struct fun
#define F(x) {#x, x},
F(memcpy)
#if __aarch64__
-F(__memcpy_bytewise)
F(__memcpy_aarch64)
# if __ARM_NEON
F(__memcpy_aarch64_simd)
diff --git a/string/test/strrchr.c b/string/test/strrchr.c
index 6277fae..532fa51 100644
--- a/string/test/strrchr.c
+++ b/string/test/strrchr.c
@@ -20,6 +20,7 @@ static const struct fun
#define F(x) {#x, x},
F(strrchr)
#if __aarch64__
+F(__strrchr_aarch64)
# if __ARM_FEATURE_SVE
F(__strrchr_aarch64_sve)
# endif