32 files changed, 598 insertions, 356 deletions
@@ -9,11 +9,11 @@ third_party { type: GIT value: "https://github.com/ARM-software/optimized-routines.git" } - version: "3377796fe24ff1d5396609205426402678208eb1" + version: "33ba19089a261964e1e84ba4edf90263b468c161" license_type: NOTICE last_upgrade_date { - year: 2019 - month: 12 - day: 26 + year: 2020 + month: 2 + day: 1 } } @@ -3,8 +3,10 @@ Arm Optimized Routines This repository contains implementations of library functions provided by Arm under MIT License (See LICENSE). Contributions -to this project are accepted, but the terms will need negotiation (so -relicensing and copyright assignment to the FSF is possible later). +to this project are accepted, but Contributors have to sign an +Assignment Agreement, please follow the instructions in +contributor-agreement.pdf. This is needed so upstreaming code +to projects that require copyright assignment is possible. Regular quarterly releases are tagged as vYY.MM (e.g. v19.11). diff --git a/contributor-agreement.pdf b/contributor-agreement.pdf Binary files differnew file mode 100644 index 0000000..f42c3ae --- /dev/null +++ b/contributor-agreement.pdf diff --git a/math/include/mathlib.h b/math/include/mathlib.h index 254954a..4493008 100644 --- a/math/include/mathlib.h +++ b/math/include/mathlib.h @@ -36,6 +36,7 @@ double __s_sin (double); double __s_cos (double); double __s_exp (double); double __s_log (double); +double __s_pow (double, double); #if __aarch64__ #if __GNUC__ >= 5 @@ -61,6 +62,7 @@ __f64x2_t __v_sin (__f64x2_t); __f64x2_t __v_cos (__f64x2_t); __f64x2_t __v_exp (__f64x2_t); __f64x2_t __v_log (__f64x2_t); +__f64x2_t __v_pow (__f64x2_t, __f64x2_t); #if __GNUC__ >= 9 || __clang_major__ >= 8 #define __vpcs __attribute__((__aarch64_vector_pcs__)) @@ -78,6 +80,7 @@ __vpcs __f64x2_t __vn_sin (__f64x2_t); __vpcs __f64x2_t __vn_cos (__f64x2_t); __vpcs __f64x2_t __vn_exp (__f64x2_t); __vpcs __f64x2_t __vn_log (__f64x2_t); +__vpcs __f64x2_t __vn_pow (__f64x2_t, __f64x2_t); /* Vector functions following the vector PCS using ABI names. */ __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t); @@ -90,6 +93,7 @@ __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t); +__vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t); #endif #endif @@ -347,7 +347,9 @@ pow (double x, double y) if (topx == 0) { /* Normalize subnormal x so exponent becomes negative. */ - ix = asuint64 (x * 0x1p52); + /* Without the barrier some versions of clang evalutate the mul + unconditionally causing spurious overflow exceptions. */ + ix = asuint64 (opt_barrier_double (x) * 0x1p52); ix &= 0x7fffffffffffffff; ix -= 52ULL << 52; } diff --git a/math/s_pow.c b/math/s_pow.c new file mode 100644 index 0000000..2e34c9f --- /dev/null +++ b/math/s_pow.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2020, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ +#define SCALAR 1 +#include "v_pow.c" diff --git a/math/test/mathbench.c b/math/test/mathbench.c index 8d3ff1d..33ceda3 100644 --- a/math/test/mathbench.c +++ b/math/test/mathbench.c @@ -128,6 +128,18 @@ xy_Z_powf (v_float x) { return _ZGVnN4vv_powf (x, x); } + +__vpcs static v_double +xy__vn_pow (v_double x) +{ + return __vn_pow (x, x); +} + +__vpcs static v_double +xy_Z_pow (v_double x) +{ + return _ZGVnN2vv_pow (x, x); +} #endif static v_float @@ -135,6 +147,12 @@ xy__v_powf (v_float x) { return __v_powf (x, x); } + +static v_double +xy__v_pow (v_double x) +{ + return __v_pow (x, x); +} #endif static float @@ -142,6 +160,12 @@ xy__s_powf (float x) { return __s_powf (x, x); } + +static double +xy__s_pow (double x) +{ + return __s_pow (x, x); +} #endif static double @@ -256,6 +280,7 @@ D (__s_sin, -3.1, 3.1) D (__s_cos, -3.1, 3.1) D (__s_exp, -9.9, 9.9) D (__s_log, 0.01, 11.1) +{"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}}, F (__s_expf, -9.9, 9.9) F (__s_expf_1u, -9.9, 9.9) F (__s_exp2f, -9.9, 9.9) @@ -270,6 +295,7 @@ VD (__v_sin, -3.1, 3.1) VD (__v_cos, -3.1, 3.1) VD (__v_exp, -9.9, 9.9) VD (__v_log, 0.01, 11.1) +{"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}}, VF (__v_dummyf, 1.0, 2.0) VF (__v_expf, -9.9, 9.9) VF (__v_expf_1u, -9.9, 9.9) @@ -285,6 +311,8 @@ VND (__vn_exp, -9.9, 9.9) VND (_ZGVnN2v_exp, -9.9, 9.9) VND (__vn_log, 0.01, 11.1) VND (_ZGVnN2v_log, 0.01, 11.1) +{"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}}, +{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}}, VND (__vn_sin, -3.1, 3.1) VND (_ZGVnN2v_sin, -3.1, 3.1) VND (__vn_cos, -3.1, 3.1) diff --git a/math/test/runulp.sh b/math/test/runulp.sh index 44393b8..a8c391b 100755 --- a/math/test/runulp.sh +++ b/math/test/runulp.sh @@ -45,6 +45,16 @@ t exp2 -0x1p-6 -0x1p6 40000 t exp2 633.3 733.3 10000 t exp2 -633.3 -777.3 10000 +L=0.02 +t log 0 0xffff000000000000 10000 +t log 0x1p-4 0x1p4 40000 +t log 0 inf 40000 + +L=0.05 +t log2 0 0xffff000000000000 10000 +t log2 0x1p-4 0x1p4 40000 +t log2 0 inf 40000 + L=0.05 t pow 0.5 2.0 x 0 inf 20000 t pow -0.5 -2.0 x 0 inf 20000 @@ -72,6 +82,16 @@ t exp2f 0 0xffff0000 10000 t exp2f 0x1p-14 0x1p8 50000 t exp2f -0x1p-14 -0x1p8 50000 +L=0.32 +t logf 0 0xffff0000 10000 +t logf 0x1p-4 0x1p4 50000 +t logf 0 inf 50000 + +L=0.26 +t log2f 0 0xffff0000 10000 +t log2f 0x1p-4 0x1p4 50000 +t log2f 0 inf 50000 + L=0.06 t sinf 0 0xffff0000 10000 t sinf 0x1p-14 0x1p54 50000 @@ -82,6 +102,16 @@ t cosf 0 0xffff0000 10000 t cosf 0x1p-14 0x1p54 50000 t cosf -0x1p-14 -0x1p54 50000 +L=0.06 +t sincosf_sinf 0 0xffff0000 10000 +t sincosf_sinf 0x1p-14 0x1p54 50000 +t sincosf_sinf -0x1p-14 -0x1p54 50000 + +L=0.06 +t sincosf_cosf 0 0xffff0000 10000 +t sincosf_cosf 0x1p-14 0x1p54 50000 +t sincosf_cosf -0x1p-14 -0x1p54 50000 + L=0.4 t powf 0x1p-1 0x1p1 x 0x1p-7 0x1p7 50000 t powf 0x1p-1 0x1p1 x -0x1p-7 -0x1p7 50000 @@ -110,6 +140,28 @@ range_exp=' -633.3 -777.3 10000 ' +range_log=' + 0 0xffff000000000000 10000 + 0x1p-4 0x1p4 400000 + 0 inf 400000 +' + +range_pow=' + 0x1p-1 0x1p1 x 0x1p-10 0x1p10 50000 + 0x1p-1 0x1p1 x -0x1p-10 -0x1p10 50000 + 0x1p-500 0x1p500 x 0x1p-1 0x1p1 50000 + 0x1p-500 0x1p500 x -0x1p-1 -0x1p1 50000 + 0x1.ep-1 0x1.1p0 x 0x1p8 0x1p16 50000 + 0x1.ep-1 0x1.1p0 x -0x1p8 -0x1p16 50000 +' + +range_sin=' + 0 0xffff000000000000 10000 + 0x1p-4 0x1p4 400000 + -0x1p-23 0x1p23 400000 +' +range_cos="$range_sin" + range_expf=' 0 0xffff0000 10000 0x1p-14 0x1p8 500000 @@ -143,6 +195,10 @@ range_powf=' # error limits L_exp=1.9 +L_log=1.2 +L_pow=0.05 
+L_sin=3.0 +L_cos=3.0 L_expf=1.49 L_expf_1u=0.4 L_exp2f=1.49 @@ -173,6 +229,26 @@ exp __v_exp $runv exp __vn_exp $runvn exp _ZGVnN2v_exp $runvn +log __s_log $runs +log __v_log $runv +log __vn_log $runvn +log _ZGVnN2v_log $runvn + +pow __s_pow $runs +pow __v_pow $runv +pow __vn_pow $runvn +pow _ZGVnN2vv_pow $runvn + +sin __s_sin $runs +sin __v_sin $runv +sin __vn_sin $runvn +sin _ZGVnN2v_sin $runvn + +cos __s_cos $runs +cos __v_cos $runv +cos __vn_cos $runvn +cos _ZGVnN2v_cos $runvn + expf __s_expf $runs expf __v_expf $runv expf __vn_expf $runvn diff --git a/math/test/ulp.c b/math/test/ulp.c index b746080..371567a 100644 --- a/math/test/ulp.c +++ b/math/test/ulp.c @@ -214,6 +214,16 @@ struct conf double errlim; }; +/* Wrappers for sincos. */ +static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);} +static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);} +static double sincos_sin(double x) {(void)cos(x); return sin(x);} +static double sincos_cos(double x) {(void)sin(x); return cos(x);} +#if USE_MPFR +static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); } +static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); } +#endif + /* A bit of a hack: call vector functions twice with the same input in lane 0 but a different value in other lanes: once with an in-range value and then with a special case value. */ @@ -240,6 +250,7 @@ static double v_sin(double x) { return __v_sin(argd(x))[0]; } static double v_cos(double x) { return __v_cos(argd(x))[0]; } static double v_exp(double x) { return __v_exp(argd(x))[0]; } static double v_log(double x) { return __v_log(argd(x))[0]; } +static double v_pow(double x, double y) { return __v_pow(argd(x),argd(y))[0]; } #ifdef __vpcs static float vn_sinf(float x) { return __vn_sinf(argf(x))[0]; } static float vn_cosf(float x) { return __vn_cosf(argf(x))[0]; } @@ -253,6 +264,7 @@ static double vn_sin(double x) { return __vn_sin(argd(x))[0]; } static double vn_cos(double x) { return __vn_cos(argd(x))[0]; } static double vn_exp(double x) { return __vn_exp(argd(x))[0]; } static double vn_log(double x) { return __vn_log(argd(x))[0]; } +static double vn_pow(double x, double y) { return __vn_pow(argd(x),argd(y))[0]; } static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; } static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; } static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; } @@ -263,6 +275,7 @@ static double Z_sin(double x) { return _ZGVnN2v_sin(argd(x))[0]; } static double Z_cos(double x) { return _ZGVnN2v_cos(argd(x))[0]; } static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; } static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; } +static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; } #endif #endif @@ -311,6 +324,8 @@ static const struct fun fun[] = { #define D2(x) F (x, x, x##l, mpfr_##x, 2, 0, d2, 0) F1 (sin) F1 (cos) + F (sincosf_sinf, sincosf_sinf, sincos_sin, sincos_mpfr_sin, 1, 1, f1, 0) + F (sincosf_cosf, sincosf_cosf, sincos_cos, sincos_mpfr_cos, 1, 1, f1, 0) F1 (exp) F1 (exp2) F1 (log) @@ -334,6 +349,7 @@ static const struct fun fun[] = { F (__s_cos, __s_cos, cosl, mpfr_cos, 1, 0, d1, 0) F (__s_exp, __s_exp, expl, mpfr_exp, 1, 0, d1, 0) F (__s_log, __s_log, logl, mpfr_log, 1, 0, d1, 0) + F (__s_pow, __s_pow, powl, mpfr_pow, 2, 0, d2, 0) #if __aarch64__ F (__v_sinf, v_sinf, sin, mpfr_sin, 1, 1, f1, 1) F (__v_cosf, v_cosf, cos, mpfr_cos, 
1, 1, f1, 1) @@ -347,6 +363,7 @@ static const struct fun fun[] = { F (__v_cos, v_cos, cosl, mpfr_cos, 1, 0, d1, 1) F (__v_exp, v_exp, expl, mpfr_exp, 1, 0, d1, 1) F (__v_log, v_log, logl, mpfr_log, 1, 0, d1, 1) + F (__v_pow, v_pow, powl, mpfr_pow, 2, 0, d2, 1) #ifdef __vpcs F (__vn_sinf, vn_sinf, sin, mpfr_sin, 1, 1, f1, 1) F (__vn_cosf, vn_cosf, cos, mpfr_cos, 1, 1, f1, 1) @@ -360,6 +377,7 @@ static const struct fun fun[] = { F (__vn_cos, vn_cos, cosl, mpfr_cos, 1, 0, d1, 1) F (__vn_exp, vn_exp, expl, mpfr_exp, 1, 0, d1, 1) F (__vn_log, vn_log, logl, mpfr_log, 1, 0, d1, 1) + F (__vn_pow, vn_pow, powl, mpfr_pow, 2, 0, d2, 1) F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1) F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1) F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1) @@ -370,6 +388,7 @@ static const struct fun fun[] = { F (_ZGVnN2v_cos, Z_cos, cosl, mpfr_cos, 1, 0, d1, 1) F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1) F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1) + F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1) #endif #endif #endif diff --git a/math/v_math.h b/math/v_math.h index 0861e98..3db22e5 100644 --- a/math/v_math.h +++ b/math/v_math.h @@ -249,6 +249,11 @@ v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, return f (x1, x2); } +static inline int +v_lanes64 (void) +{ + return 1; +} static inline v_f64_t v_f64 (f64_t x) { @@ -264,6 +269,16 @@ v_s64 (s64_t x) { return x; } +static inline f64_t +v_get_f64 (v_f64_t x, int i) +{ + return x; +} +static inline void +v_set_f64 (v_f64_t *x, int i, f64_t v) +{ + *x = v; +} /* true if any elements of a v_cond result is non-zero. */ static inline int v_any_u64 (v_u64_t x) @@ -506,6 +521,11 @@ v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]}; } +static inline int +v_lanes64 (void) +{ + return 2; +} static inline v_f64_t v_f64 (f64_t x) { @@ -521,6 +541,16 @@ v_s64 (s64_t x) { return (v_s64_t){x, x}; } +static inline f64_t +v_get_f64 (v_f64_t x, int i) +{ + return x[i]; +} +static inline void +v_set_f64 (v_f64_t *x, int i, f64_t v) +{ + (*x)[i] = v; +} /* true if any elements of a v_cond result is non-zero. */ static inline int v_any_u64 (v_u64_t x) diff --git a/math/v_pow.c b/math/v_pow.c new file mode 100644 index 0000000..a209d57 --- /dev/null +++ b/math/v_pow.c @@ -0,0 +1,27 @@ +/* + * Double-precision vector pow function. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "mathlib.h" +#include "v_math.h" +#if V_SUPPORTED + +VPCS_ATTR +v_f64_t +V_NAME(pow) (v_f64_t x, v_f64_t y) +{ + v_f64_t z; + for (int lane = 0; lane < v_lanes64 (); lane++) + { + f64_t sx = v_get_f64 (x, lane); + f64_t sy = v_get_f64 (y, lane); + f64_t sz = pow (sx, sy); + v_set_f64 (&z, lane, sz); + } + return z; +} +VPCS_ALIAS +#endif diff --git a/math/vn_pow.c b/math/vn_pow.c new file mode 100644 index 0000000..2609501 --- /dev/null +++ b/math/vn_pow.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_pow. + * + * Copyright (c) 2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_pow, _ZGVnN2vv_pow) +#include "v_pow.c" +#endif diff --git a/string/aarch64/memchr.S b/string/aarch64/memchr.S index aff6e3d..10be49e 100644 --- a/string/aarch64/memchr.S +++ b/string/aarch64/memchr.S @@ -11,6 +11,8 @@ * Neon Available. */ +#include "../asmdefs.h" + /* Arguments and results. 
*/ #define srcin x0 #define chrin w1 @@ -44,17 +46,9 @@ * identify exactly which byte has matched. */ - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn __memchr_aarch64 +ENTRY (__memchr_aarch64) /* Do not dereference srcin if no bytes to compare. */ - cbz cntin, .Lzero_length + cbz cntin, L(zero_length) /* * Magic constant 0x40100401 allows us to identify which lane matches * the requested byte. @@ -67,7 +61,7 @@ def_fn __memchr_aarch64 dup vrepmask.4s, wtmp2 ands soff, srcin, #31 and cntrem, cntin, #31 - b.eq .Lloop + b.eq L(loop) /* * Input string is not 32-byte aligned. We calculate the syndrome @@ -90,25 +84,25 @@ def_fn __memchr_aarch64 lsr synd, synd, tmp lsl synd, synd, tmp /* The first block can also be the last */ - b.ls .Lmasklast + b.ls L(masklast) /* Have we found something already? */ - cbnz synd, .Ltail + cbnz synd, L(tail) -.Lloop: +L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 subs cntin, cntin, #32 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b /* If we're out of data we finish regardless of the result */ - b.ls .Lend + b.ls L(end) /* Use a fast check for the termination condition */ orr vend.16b, vhas_chr1.16b, vhas_chr2.16b addp vend.2d, vend.2d, vend.2d mov synd, vend.d[0] /* We're not out of data, loop if we haven't found the character */ - cbz synd, .Lloop + cbz synd, L(loop) -.Lend: +L(end): /* Termination condition found, let's calculate the syndrome value */ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b @@ -116,9 +110,9 @@ def_fn __memchr_aarch64 addp vend.16b, vend.16b, vend.16b /* 128->64 */ mov synd, vend.d[0] /* Only do the clear for the last possible block */ - b.hi .Ltail + b.hi L(tail) -.Lmasklast: +L(masklast): /* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */ add tmp, cntrem, soff and tmp, tmp, #31 @@ -127,7 +121,7 @@ def_fn __memchr_aarch64 lsl synd, synd, tmp lsr synd, synd, tmp -.Ltail: +L(tail): /* Count the trailing zeros using bit reversing */ rbit synd, synd /* Compensate the last post-increment */ @@ -142,8 +136,8 @@ def_fn __memchr_aarch64 csel result, xzr, result, eq ret -.Lzero_length: +L(zero_length): mov result, #0 ret - .size __memchr_aarch64, . - __memchr_aarch64 +END (__memchr_aarch64) diff --git a/string/aarch64/memcmp.S b/string/aarch64/memcmp.S index 72a66bc..6722516 100644 --- a/string/aarch64/memcmp.S +++ b/string/aarch64/memcmp.S @@ -9,7 +9,7 @@ * ARMv8-a, AArch64, unaligned accesses. */ -#define L(l) .L ## l +#include "../asmdefs.h" /* Parameters and result. */ #define src1 x0 @@ -27,15 +27,7 @@ #define tmp1 x7 #define tmp2 x8 - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn __memcmp_aarch64 p2align=6 +ENTRY (__memcmp_aarch64) subs limit, limit, 8 b.lo L(less8) @@ -138,4 +130,4 @@ L(byte_loop): sub result, data1w, data2w ret - .size __memcmp_aarch64, . 
- __memcmp_aarch64 +END (__memcmp_aarch64) diff --git a/string/aarch64/memset.S b/string/aarch64/memset.S index aef22e9..3868141 100644 --- a/string/aarch64/memset.S +++ b/string/aarch64/memset.S @@ -11,6 +11,7 @@ * */ +#include "../asmdefs.h" #define dstin x0 #define val x1 @@ -25,17 +26,7 @@ #define zva_len x7 #define zva_lenw w7 -#define L(l) .L ## l - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn __memset_aarch64 p2align=6 +ENTRY (__memset_aarch64) dup v0.16B, valw add dstend, dstin, count @@ -185,4 +176,4 @@ L(zva_other): 4: add count, count, zva_len b L(tail64) - .size __memset_aarch64, . - __memset_aarch64 +END (__memset_aarch64) diff --git a/string/aarch64/strchr.S b/string/aarch64/strchr.S index cdb38aa..00d9be3 100644 --- a/string/aarch64/strchr.S +++ b/string/aarch64/strchr.S @@ -11,6 +11,8 @@ * Neon Available. */ +#include "../asmdefs.h" + /* Arguments and results. */ #define srcin x0 #define chrin w1 @@ -48,15 +50,7 @@ /* Locals and temporaries. */ - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn __strchr_aarch64 +ENTRY (__strchr_aarch64) /* Magic constant 0x40100401 to allow us to identify which lane matches the requested byte. Magic constant 0x80200802 used similarly for NUL termination. */ @@ -67,7 +61,7 @@ def_fn __strchr_aarch64 dup vrepmask_c.4s, wtmp2 ands tmp1, srcin, #31 add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ - b.eq .Lloop + b.eq L(loop) /* Input string is not 32-byte aligned. Rather than forcing the padding bytes to a safe value, we calculate the syndrome @@ -93,9 +87,9 @@ def_fn __strchr_aarch64 mov tmp3, vend1.d[0] bic tmp1, tmp3, tmp1 // Mask padding bits. - cbnz tmp1, .Ltail + cbnz tmp1, L(tail) -.Lloop: +L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 cmeq vhas_nul1.16b, vdata1.16b, #0 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b @@ -107,7 +101,7 @@ def_fn __strchr_aarch64 orr vend1.16b, vend1.16b, vend2.16b addp vend1.2d, vend1.2d, vend1.2d mov tmp1, vend1.d[0] - cbz tmp1, .Lloop + cbz tmp1, L(loop) /* Termination condition found. Now need to establish exactly why we terminated. */ @@ -121,7 +115,7 @@ def_fn __strchr_aarch64 addp vend1.16b, vend1.16b, vend2.16b // 128->64 mov tmp1, vend1.d[0] -.Ltail: +L(tail): /* Count the trailing zeros, by bit reversing... */ rbit tmp1, tmp1 /* Re-bias source. */ @@ -134,4 +128,4 @@ def_fn __strchr_aarch64 csel result, result, xzr, eq ret - .size __strchr_aarch64, . - __strchr_aarch64 +END (__strchr_aarch64) diff --git a/string/aarch64/strchrnul.S b/string/aarch64/strchrnul.S index 4aee293..81264ea 100644 --- a/string/aarch64/strchrnul.S +++ b/string/aarch64/strchrnul.S @@ -11,6 +11,8 @@ * Neon Available. */ +#include "../asmdefs.h" + /* Arguments and results. */ #define srcin x0 #define chrin w1 @@ -44,15 +46,7 @@ /* Locals and temporaries. */ - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn __strchrnul_aarch64 +ENTRY (__strchrnul_aarch64) /* Magic constant 0x40100401 to allow us to identify which lane matches the termination condition. */ mov wtmp2, #0x0401 @@ -61,7 +55,7 @@ def_fn __strchrnul_aarch64 bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ dup vrepmask.4s, wtmp2 ands tmp1, srcin, #31 - b.eq .Lloop + b.eq L(loop) /* Input string is not 32-byte aligned. 
Rather than forcing the padding bytes to a safe value, we calculate the syndrome @@ -85,9 +79,9 @@ def_fn __strchrnul_aarch64 mov tmp3, vend1.d[0] bic tmp1, tmp3, tmp1 // Mask padding bits. - cbnz tmp1, .Ltail + cbnz tmp1, L(tail) -.Lloop: +L(loop): ld1 {vdata1.16b, vdata2.16b}, [src], #32 cmeq vhas_nul1.16b, vdata1.16b, #0 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b @@ -99,7 +93,7 @@ def_fn __strchrnul_aarch64 orr vend1.16b, vhas_chr1.16b, vhas_chr2.16b addp vend1.2d, vend1.2d, vend1.2d mov tmp1, vend1.d[0] - cbz tmp1, .Lloop + cbz tmp1, L(loop) /* Termination condition found. Now need to establish exactly why we terminated. */ @@ -109,7 +103,7 @@ def_fn __strchrnul_aarch64 addp vend1.16b, vend1.16b, vend1.16b // 128->64 mov tmp1, vend1.d[0] -.Ltail: +L(tail): /* Count the trailing zeros, by bit reversing... */ rbit tmp1, tmp1 /* Re-bias source. */ @@ -119,4 +113,4 @@ def_fn __strchrnul_aarch64 add result, src, tmp1, lsr #1 ret - .size __strchrnul_aarch64, . - __strchrnul_aarch64 +END (__strchrnul_aarch64) diff --git a/string/aarch64/strcmp.S b/string/aarch64/strcmp.S index 2aa367c..65af5ce 100644 --- a/string/aarch64/strcmp.S +++ b/string/aarch64/strcmp.S @@ -10,15 +10,7 @@ * ARMv8-a, AArch64 */ - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -#define L(label) .L ## label +#include "../asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f @@ -44,7 +36,7 @@ #define pos x11 /* Start of performance-critical section -- one 64B cache line. */ -def_fn __strcmp_aarch64 p2align=6 +ENTRY (__strcmp_aarch64) eor tmp1, src1, src2 mov zeroones, #REP8_01 tst tmp1, #7 @@ -174,4 +166,5 @@ L(loop_misaligned): L(done): sub result, data1, data2 ret - .size __strcmp_aarch64, .-__strcmp_aarch64 + +END (__strcmp_aarch64) diff --git a/string/aarch64/strcpy.S b/string/aarch64/strcpy.S index 4e10b4d..4edffcf 100644 --- a/string/aarch64/strcpy.S +++ b/string/aarch64/strcpy.S @@ -10,6 +10,8 @@ * ARMv8-a, AArch64, unaligned accesses, min page size 4k. */ +#include "../asmdefs.h" + /* To build as stpcpy, define BUILD_STPCPY before compiling this file. To test the page crossing code path more thoroughly, compile with @@ -46,14 +48,6 @@ #define STRCPY __strcpy_aarch64 #endif - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and can be done in parallel across the entire word. */ @@ -85,7 +79,7 @@ #define MIN_PAGE_SIZE (1 << MIN_PAGE_P2) -def_fn STRCPY p2align=6 +ENTRY (STRCPY) /* For moderately short strings, the fastest way to do the copy is to calculate the length of the string in the same way as strlen, then essentially do a memcpy of the result. This avoids the need for @@ -105,9 +99,9 @@ def_fn STRCPY p2align=6 srcin + 15 causes bit[MIN_PAGE_P2] to change value. A 16-byte aligned string will never fail the page align check, so will always take the fast path. 
*/ - b.gt .Lpage_cross + b.gt L(page_cross) -.Lpage_cross_ok: +L(page_cross_ok): ldp data1, data2, [srcin] #ifdef __AARCH64EB__ /* Because we expect the end to be found within 16 characters @@ -119,7 +113,7 @@ def_fn STRCPY p2align=6 sub tmp1, tmp2, zeroones orr tmp2, tmp2, #REP8_7f bics has_nul1, tmp1, tmp2 - b.ne .Lfp_le8 + b.ne L(fp_le8) rev tmp4, data2 sub tmp3, tmp4, zeroones orr tmp4, tmp4, #REP8_7f @@ -127,17 +121,17 @@ def_fn STRCPY p2align=6 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f bics has_nul1, tmp1, tmp2 - b.ne .Lfp_le8 + b.ne L(fp_le8) sub tmp3, data2, zeroones orr tmp4, data2, #REP8_7f #endif bics has_nul2, tmp3, tmp4 - b.eq .Lbulk_entry + b.eq L(bulk_entry) /* The string is short (<=16 bytes). We don't know exactly how short though, yet. Work out the exact length so that we can quickly select the optimal copy strategy. */ -.Lfp_gt8: +L(fp_gt8): rev has_nul2, has_nul2 clz pos, has_nul2 mov tmp2, #56 @@ -155,12 +149,12 @@ def_fn STRCPY p2align=6 #endif ret -.Lfp_le8: +L(fp_le8): rev has_nul1, has_nul1 clz pos, has_nul1 add dst, dstin, pos, lsr #3 /* Bits to bytes. */ subs tmp2, pos, #24 /* Pos in bits. */ - b.lt .Lfp_lt4 + b.lt L(fp_lt4) #ifdef __AARCH64EB__ mov tmp2, #56 sub pos, tmp2, pos @@ -176,15 +170,15 @@ def_fn STRCPY p2align=6 mov dstin, dst #endif ret -.Lfp_lt4: - cbz pos, .Lfp_lt2 +L(fp_lt4): + cbz pos, L(fp_lt2) /* 2->3 bytes to copy. */ #ifdef __AARCH64EB__ lsr data1, data1, #48 #endif strh data1w, [dstin] /* Fall-through, one byte (max) to go. */ -.Lfp_lt2: +L(fp_lt2): /* Null-terminated string. Last character must be zero! */ strb wzr, [dst] #ifdef BUILD_STPCPY @@ -195,20 +189,20 @@ def_fn STRCPY p2align=6 .p2align 6 /* Aligning here ensures that the entry code and main loop all lies within one 64-byte cache line. */ -.Lbulk_entry: +L(bulk_entry): sub to_align, to_align, #16 stp data1, data2, [dstin] sub src, srcin, to_align sub dst, dstin, to_align - b .Lentry_no_page_cross + b L(entry_no_page_cross) /* The inner loop deals with two Dwords at a time. This has a slightly higher start-up cost, but we should win quite quickly, especially on cores with a high number of issue slots per cycle, as we get much better parallelism out of the operations. */ -.Lmain_loop: +L(main_loop): stp data1, data2, [dst], #16 -.Lentry_no_page_cross: +L(entry_no_page_cross): ldp data1, data2, [src], #16 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f @@ -217,7 +211,7 @@ def_fn STRCPY p2align=6 bic has_nul1, tmp1, tmp2 bics has_nul2, tmp3, tmp4 ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq .Lmain_loop + b.eq L(main_loop) /* Since we know we are copying at least 16 bytes, the fastest way to deal with the tail is to determine the location of the @@ -250,7 +244,7 @@ def_fn STRCPY p2align=6 #endif ret -.Lpage_cross: +L(page_cross): bic src, srcin, #15 /* Start by loading two words at [srcin & ~15], then forcing the bytes that precede srcin to 0xff. This means they never look @@ -276,7 +270,7 @@ def_fn STRCPY p2align=6 bic has_nul1, tmp1, tmp2 bics has_nul2, tmp3, tmp4 ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq .Lpage_cross_ok + b.eq L(page_cross_ok) /* We now need to make data1 and data2 look like they've been loaded directly from srcin. Do a rotate on the 128-bit value. */ lsl tmp1, to_align, #3 /* Bytes->bits. */ @@ -307,8 +301,8 @@ def_fn STRCPY p2align=6 orr tmp4, data2, #REP8_7f #endif bic has_nul1, tmp1, tmp2 - cbnz has_nul1, .Lfp_le8 + cbnz has_nul1, L(fp_le8) bic has_nul2, tmp3, tmp4 - b .Lfp_gt8 + b L(fp_gt8) - .size STRCPY, . 
- STRCPY +END (STRCPY) diff --git a/string/aarch64/strlen.S b/string/aarch64/strlen.S index 26388d7..2293f73 100644 --- a/string/aarch64/strlen.S +++ b/string/aarch64/strlen.S @@ -10,6 +10,8 @@ * ARMv8-a, AArch64, unaligned accesses, min page size 4k. */ +#include "../asmdefs.h" + /* To test the page crossing code path more thoroughly, compile with -DTEST_PAGE_CROSS - this will force all calls through the slower entry path. This option is not intended for production use. */ @@ -30,16 +32,6 @@ #define tmp4 x7 #define zeroones x8 -#define L(l) .L ## l - - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and can be done in parallel across the entire word. A faster check @@ -81,7 +73,7 @@ whether the first fetch, which may be misaligned, crosses a page boundary. */ -def_fn __strlen_aarch64 p2align=6 +ENTRY (__strlen_aarch64) and tmp1, srcin, MIN_PAGE_SIZE - 1 mov zeroones, REP8_01 cmp tmp1, MIN_PAGE_SIZE - 16 @@ -122,7 +114,7 @@ L(main_loop_entry): sub src, src, 16 L(main_loop): ldp data1, data2, [src, 32]! -.Lpage_cross_entry: +L(page_cross_entry): sub tmp1, data1, zeroones sub tmp3, data2, zeroones orr tmp2, tmp1, tmp3 @@ -211,4 +203,4 @@ L(page_cross): csel data2, data2, tmp2, eq b L(page_cross_entry) - .size __strlen_aarch64, . - __strlen_aarch64 +END (__strlen_aarch64) diff --git a/string/aarch64/strncmp.S b/string/aarch64/strncmp.S index ced72b9..fbd08ee 100644 --- a/string/aarch64/strncmp.S +++ b/string/aarch64/strncmp.S @@ -10,13 +10,7 @@ * ARMv8-a, AArch64 */ - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm +#include "../asmdefs.h" #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f @@ -51,14 +45,14 @@ .rep 7 nop /* Pad so that the loop below fits a cache line. */ .endr -def_fn __strncmp_aarch64 - cbz limit, .Lret0 +ENTRY_ALIGN (__strncmp_aarch64, 0) + cbz limit, L(ret0) eor tmp1, src1, src2 mov zeroones, #REP8_01 tst tmp1, #7 and count, src1, #7 - b.ne .Lmisaligned8 - cbnz count, .Lmutual_align + b.ne L(misaligned8) + cbnz count, L(mutual_align) /* Calculate the number of full and partial words -1. */ sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ @@ -67,10 +61,10 @@ def_fn __strncmp_aarch64 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and can be done in parallel across the entire word. */ /* Start of performance-critical section -- one 64B cache line. */ -.Lloop_aligned: +L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 -.Lstart_realigned: +L(start_realigned): subs limit_wd, limit_wd, #1 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f @@ -78,15 +72,15 @@ def_fn __strncmp_aarch64 csinv endloop, diff, xzr, pl /* Last Dword or differences. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp endloop, #0, #0, eq - b.eq .Lloop_aligned + b.eq L(loop_aligned) /* End of performance-critical section -- one 64B cache line. */ /* Not reached the limit, must have found the end or a diff. */ - tbz limit_wd, #63, .Lnot_limit + tbz limit_wd, #63, L(not_limit) /* Limit % 8 == 0 => all bytes significant. */ ands limit, limit, #7 - b.eq .Lnot_limit + b.eq L(not_limit) lsl limit, limit, #3 /* Bits -> bytes. */ mov mask, #~0 @@ -101,7 +95,7 @@ def_fn __strncmp_aarch64 /* Make sure that the NUL byte is marked in the syndrome. 
*/ orr has_nul, has_nul, mask -.Lnot_limit: +L(not_limit): orr syndrome, diff, has_nul #ifndef __AARCH64EB__ @@ -154,7 +148,7 @@ def_fn __strncmp_aarch64 ret #endif -.Lmutual_align: +L(mutual_align): /* Sources are mutually aligned, but are not currently at an alignment boundary. Round down the addresses and then mask off the bytes that precede the start point. @@ -182,56 +176,56 @@ def_fn __strncmp_aarch64 orr data1, data1, tmp2 orr data2, data2, tmp2 add limit_wd, limit_wd, tmp3, lsr #3 - b .Lstart_realigned + b L(start_realigned) .p2align 6 /* Don't bother with dwords for up to 16 bytes. */ -.Lmisaligned8: +L(misaligned8): cmp limit, #16 - b.hs .Ltry_misaligned_words + b.hs L(try_misaligned_words) -.Lbyte_loop: +L(byte_loop): /* Perhaps we can do better than this. */ ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 subs limit, limit, #1 ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq .Lbyte_loop -.Ldone: + b.eq L(byte_loop) +L(done): sub result, data1, data2 ret /* Align the SRC1 to a dword by doing a bytewise compare and then do the dword loop. */ -.Ltry_misaligned_words: +L(try_misaligned_words): lsr limit_wd, limit, #3 - cbz count, .Ldo_misaligned + cbz count, L(do_misaligned) neg count, count and count, count, #7 sub limit, limit, count lsr limit_wd, limit, #3 -.Lpage_end_loop: +L(page_end_loop): ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 cmp data1w, #1 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.ne .Ldone + b.ne L(done) subs count, count, #1 - b.hi .Lpage_end_loop + b.hi L(page_end_loop) -.Ldo_misaligned: +L(do_misaligned): /* Prepare ourselves for the next page crossing. Unlike the aligned loop, we fetch 1 less dword because we risk crossing bounds on SRC2. */ mov count, #8 subs limit_wd, limit_wd, #1 - b.lo .Ldone_loop -.Lloop_misaligned: + b.lo L(done_loop) +L(loop_misaligned): and tmp2, src2, #0xff8 eor tmp2, tmp2, #0xff8 - cbz tmp2, .Lpage_end_loop + cbz tmp2, L(page_end_loop) ldr data1, [src1], #8 ldr data2, [src2], #8 @@ -240,14 +234,14 @@ def_fn __strncmp_aarch64 eor diff, data1, data2 /* Non-zero if differences found. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp diff, #0, #0, eq - b.ne .Lnot_limit + b.ne L(not_limit) subs limit_wd, limit_wd, #1 - b.pl .Lloop_misaligned + b.pl L(loop_misaligned) -.Ldone_loop: +L(done_loop): /* We found a difference or a NULL before the limit was reached. */ and limit, limit, #7 - cbz limit, .Lnot_limit + cbz limit, L(not_limit) /* Read the last word. */ sub src1, src1, 8 sub src2, src2, 8 @@ -258,9 +252,10 @@ def_fn __strncmp_aarch64 eor diff, data1, data2 /* Non-zero if differences found. */ bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp diff, #0, #0, eq - b.ne .Lnot_limit + b.ne L(not_limit) -.Lret0: +L(ret0): mov result, #0 ret - .size __strncmp_aarch64, . - __strncmp_aarch64 + +END ( __strncmp_aarch64) diff --git a/string/aarch64/strnlen.S b/string/aarch64/strnlen.S index b02c846..df66b60 100644 --- a/string/aarch64/strnlen.S +++ b/string/aarch64/strnlen.S @@ -10,6 +10,8 @@ * ARMv8-a, AArch64 */ +#include "../asmdefs.h" + /* Arguments and results. 
*/ #define srcin x0 #define len x0 @@ -30,36 +32,28 @@ #define pos x13 #define limit_wd x14 - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f #define REP8_80 0x8080808080808080 .text .p2align 6 -.Lstart: +L(start): /* Pre-pad to ensure critical loop begins an icache line. */ .rep 7 nop .endr /* Put this code here to avoid wasting more space with pre-padding. */ -.Lhit_limit: +L(hit_limit): mov len, limit ret -def_fn __strnlen_aarch64 - cbz limit, .Lhit_limit +ENTRY_ALIGN (__strnlen_aarch64, 0) + cbz limit, L(hit_limit) mov zeroones, #REP8_01 bic src, srcin, #15 ands tmp1, srcin, #15 - b.ne .Lmisaligned + b.ne L(misaligned) /* Calculate the number of full and partial words -1. */ sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */ lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */ @@ -73,9 +67,9 @@ def_fn __strnlen_aarch64 cycle, as we get much better parallelism out of the operations. */ /* Start of critial section -- keep to one 64Byte cache line. */ -.Lloop: +L(loop): ldp data1, data2, [src], #16 -.Lrealigned: +L(realigned): sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f sub tmp3, data2, zeroones @@ -85,24 +79,24 @@ def_fn __strnlen_aarch64 subs limit_wd, limit_wd, #1 orr tmp1, has_nul1, has_nul2 ccmp tmp1, #0, #0, pl /* NZCV = 0000 */ - b.eq .Lloop + b.eq L(loop) /* End of critical section -- keep to one 64Byte cache line. */ orr tmp1, has_nul1, has_nul2 - cbz tmp1, .Lhit_limit /* No null in final Qword. */ + cbz tmp1, L(hit_limit) /* No null in final Qword. */ /* We know there's a null in the final Qword. The easiest thing to do now is work out the length of the string and return MIN (len, limit). */ sub len, src, srcin - cbz has_nul1, .Lnul_in_data2 + cbz has_nul1, L(nul_in_data2) #ifdef __AARCH64EB__ mov data2, data1 #endif sub len, len, #8 mov has_nul2, has_nul1 -.Lnul_in_data2: +L(nul_in_data2): #ifdef __AARCH64EB__ /* For big-endian, carry propagation (if the final byte in the string is 0x01) means we cannot use has_nul directly. The @@ -121,7 +115,7 @@ def_fn __strnlen_aarch64 csel len, len, limit, ls /* Return the lower value. */ ret -.Lmisaligned: +L(misaligned): /* Deal with a partial first word. We're doing two things in parallel here; 1) Calculate the number of words (but avoiding overflow if @@ -156,5 +150,6 @@ def_fn __strnlen_aarch64 csinv data1, data1, xzr, le csel data2, data2, data2a, le - b .Lrealigned - .size __strnlen_aarch64, . - .Lstart /* Include pre-padding in size. */ + b L(realigned) + +END (__strnlen_aarch64) diff --git a/string/aarch64/strrchr.S b/string/aarch64/strrchr.S new file mode 100644 index 0000000..1b4caac --- /dev/null +++ b/string/aarch64/strrchr.S @@ -0,0 +1,147 @@ +/* + * strrchr - find last position of a character in a string. + * + * Copyright (c) 2014-2020, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +/* Assumptions: + * + * ARMv8-a, AArch64 + * Neon Available. + */ + +#include "../asmdefs.h" + +/* Arguments and results. 
*/ +#define srcin x0 +#define chrin w1 + +#define result x0 + +#define src x2 +#define tmp1 x3 +#define wtmp2 w4 +#define tmp3 x5 +#define src_match x6 +#define src_offset x7 +#define const_m1 x8 +#define tmp4 x9 +#define nul_match x10 +#define chr_match x11 + +#define vrepchr v0 +#define vdata1 v1 +#define vdata2 v2 +#define vhas_nul1 v3 +#define vhas_nul2 v4 +#define vhas_chr1 v5 +#define vhas_chr2 v6 +#define vrepmask_0 v7 +#define vrepmask_c v16 +#define vend1 v17 +#define vend2 v18 + +/* Core algorithm. + + For each 32-byte hunk we calculate a 64-bit syndrome value, with + two bits per byte (LSB is always in bits 0 and 1, for both big + and little-endian systems). For each tuple, bit 0 is set iff + the relevant byte matched the requested character; bit 1 is set + iff the relevant byte matched the NUL end of string (we trigger + off bit0 for the special case of looking for NUL). Since the bits + in the syndrome reflect exactly the order in which things occur + in the original string a count_trailing_zeros() operation will + identify exactly which byte is causing the termination, and why. */ + +ENTRY (__strrchr_aarch64) + /* Magic constant 0x40100401 to allow us to identify which lane + matches the requested byte. Magic constant 0x80200802 used + similarly for NUL termination. */ + mov wtmp2, #0x0401 + movk wtmp2, #0x4010, lsl #16 + dup vrepchr.16b, chrin + bic src, srcin, #31 /* Work with aligned 32-byte hunks. */ + dup vrepmask_c.4s, wtmp2 + mov src_offset, #0 + ands tmp1, srcin, #31 + add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */ + b.eq L(aligned) + + /* Input string is not 32-byte aligned. Rather than forcing + the padding bytes to a safe value, we calculate the syndrome + for all the bytes, but then mask off those bits of the + syndrome that are related to the padding. */ + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + neg tmp1, tmp1 + cmeq vhas_nul1.16b, vdata1.16b, #0 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_nul2.16b, vdata2.16b, #0 + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b + and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b + and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b + addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128 + addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 + addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b // 128->64 + addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64 + mov nul_match, vhas_nul1.d[0] + lsl tmp1, tmp1, #1 + mov const_m1, #~0 + mov chr_match, vhas_chr1.d[0] + lsr tmp3, const_m1, tmp1 + + bic nul_match, nul_match, tmp3 // Mask padding bits. + bic chr_match, chr_match, tmp3 // Mask padding bits. 
+ cbnz nul_match, L(tail) + +L(loop): + cmp chr_match, #0 + csel src_match, src, src_match, ne + csel src_offset, chr_match, src_offset, ne +L(aligned): + ld1 {vdata1.16b, vdata2.16b}, [src], #32 + cmeq vhas_nul1.16b, vdata1.16b, #0 + cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b + cmeq vhas_nul2.16b, vdata2.16b, #0 + cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b + addp vend1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128 + and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b + and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b + addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 + addp vend1.16b, vend1.16b, vend1.16b // 128->64 + addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64 + mov nul_match, vend1.d[0] + mov chr_match, vhas_chr1.d[0] + cbz nul_match, L(loop) + + and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b + and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b + addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b + addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b + mov nul_match, vhas_nul1.d[0] + +L(tail): + /* Work out exactly where the string ends. */ + sub tmp4, nul_match, #1 + eor tmp4, tmp4, nul_match + ands chr_match, chr_match, tmp4 + /* And pick the values corresponding to the last match. */ + csel src_match, src, src_match, ne + csel src_offset, chr_match, src_offset, ne + + /* Count down from the top of the syndrome to find the last match. */ + clz tmp3, src_offset + /* Src_match points beyond the word containing the match, so we can + simply subtract half the bit-offset into the syndrome. Because + we are counting down, we need to go back one more character. */ + add tmp3, tmp3, #2 + sub result, src_match, tmp3, lsr #1 + /* But if the syndrome shows no match was found, then return NULL. */ + cmp src_offset, #0 + csel result, result, xzr, ne + + ret + +END (__strrchr_aarch64) diff --git a/string/arm/memcpy.S b/string/arm/memcpy.S index 3346e4f..aab78a2 100644 --- a/string/arm/memcpy.S +++ b/string/arm/memcpy.S @@ -17,6 +17,8 @@ */ +#include "../asmdefs.h" + .syntax unified /* This implementation requires ARM state. */ .arm @@ -118,23 +120,15 @@ .endm #endif - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - -def_fn __memcpy_arm p2align=6 +ENTRY (__memcpy_arm) mov dst, dstin /* Preserve dstin, we need to return it. */ cmp count, #64 - bge .Lcpy_not_short + bge L(cpy_not_short) /* Deal with small copies quickly by dropping straight into the exit block. */ -.Ltail63unaligned: +L(tail63unaligned): #ifdef USE_NEON and tmp1, count, #0x38 rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) @@ -213,13 +207,13 @@ def_fn __memcpy_arm p2align=6 strbne src, [dst] bx lr -.Lcpy_not_short: +L(cpy_not_short): /* At least 64 bytes to copy, but don't know the alignment yet. */ str tmp2, [sp, #-FRAME_SIZE]! and tmp2, src, #7 and tmp1, dst, #7 cmp tmp1, tmp2 - bne .Lcpy_notaligned + bne L(cpy_notaligned) #ifdef USE_VFP /* Magic dust alert! Force VFP on Cortex-A9. Experiments show @@ -245,12 +239,12 @@ def_fn __memcpy_arm p2align=6 1: subs tmp2, count, #64 /* Use tmp2 for count. */ - blt .Ltail63aligned + blt L(tail63aligned) cmp tmp2, #512 - bge .Lcpy_body_long + bge L(cpy_body_long) -.Lcpy_body_medium: /* Count in tmp2. */ +L(cpy_body_medium): /* Count in tmp2. */ #ifdef USE_VFP 1: vldr d0, [src, #0] @@ -274,9 +268,9 @@ def_fn __memcpy_arm p2align=6 add dst, dst, #64 bge 1b tst tmp2, #0x3f - beq .Ldone + beq L(done) -.Ltail63aligned: /* Count in tmp2. */ +L(tail63aligned): /* Count in tmp2. 
*/ and tmp1, tmp2, #0x38 add dst, dst, tmp1 add src, src, tmp1 @@ -327,7 +321,7 @@ def_fn __memcpy_arm p2align=6 add src, src, #8 add dst, dst, #8 -.Ltail63aligned: /* Count in tmp2. */ +L(tail63aligned): /* Count in tmp2. */ /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but we know that the src and dest are 64-bit aligned so we can use LDRD/STRD to improve efficiency. */ @@ -364,11 +358,11 @@ def_fn __memcpy_arm p2align=6 strhcs tmp1, [dst], #2 strbne tmp2, [dst] -.Ldone: +L(done): ldr tmp2, [sp], #FRAME_SIZE bx lr -.Lcpy_body_long: /* Count in tmp2. */ +L(cpy_body_long): /* Count in tmp2. */ /* Long copy. We know that there's at least (prefetch_lines * 64) bytes to go. */ @@ -425,7 +419,7 @@ def_fn __memcpy_arm p2align=6 vstr d2, [dst, #64 + 56] add dst, dst, #128 add tmp2, tmp2, #prefetch_lines * 64 - b .Lcpy_body_medium + b L(cpy_body_medium) #else /* Long copy. Use an SMS style loop to maximize the I/O bandwidth of the core. We don't have enough spare registers @@ -479,12 +473,12 @@ def_fn __memcpy_arm p2align=6 ldrd D_l, D_h, [sp, #24] add dst, dst, #72 tst tmp2, #0x3f - bne .Ltail63aligned + bne L(tail63aligned) ldr tmp2, [sp], #FRAME_SIZE bx lr #endif -.Lcpy_notaligned: +L(cpy_notaligned): pld [src] pld [src, #64] /* There's at least 64 bytes to copy, but there is no mutual @@ -506,7 +500,7 @@ def_fn __memcpy_arm p2align=6 pld [src, #(3 * 64)] subs count, count, #64 ldrmi tmp2, [sp], #FRAME_SIZE - bmi .Ltail63unaligned + bmi L(tail63unaligned) pld [src, #(4 * 64)] #ifdef USE_NEON @@ -587,7 +581,7 @@ def_fn __memcpy_arm p2align=6 ands count, tmp2, #0x3f #endif ldr tmp2, [sp], #FRAME_SIZE - bne .Ltail63unaligned + bne L(tail63unaligned) bx lr - .size __memcpy_arm, . - __memcpy_arm +END (__memcpy_arm) diff --git a/string/arm/strcmp-armv6m.S b/string/arm/strcmp-armv6m.S index 5ea06c9..d615231 100644 --- a/string/arm/strcmp-armv6m.S +++ b/string/arm/strcmp-armv6m.S @@ -26,12 +26,7 @@ DoSub \n, \label .endm - .text - .p2align 0 - .global __strcmp_armv6m - .type __strcmp_armv6m, %function -__strcmp_armv6m: - .cfi_startproc +ENTRY_ALIGN (__strcmp_armv6m, 4) mov r2, r0 push {r4, r5, r6, lr} orrs r2, r1 @@ -114,5 +109,5 @@ __strcmp_armv6m: 7: subs r0, r2, r3 pop {r4, r5, r6, pc} - .cfi_endproc - .size __strcmp_armv6m, . - __strcmp_armv6m + +END (__strcmp_armv6m) diff --git a/string/arm/strcmp.S b/string/arm/strcmp.S index fb9cae3..295db8b 100644 --- a/string/arm/strcmp.S +++ b/string/arm/strcmp.S @@ -10,6 +10,8 @@ is sufficiently aligned. Use saturating arithmetic to optimize the compares. */ +#include "../asmdefs.h" + /* Build Options: STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first byte in the string. If comparing completely random strings @@ -48,14 +50,6 @@ #define LSB 0x000000ff #endif /* not __ARM_BIG_ENDIAN */ - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm - /* Parameters and result. */ #define src1 r0 #define src2 r1 @@ -131,23 +125,22 @@ .text .p2align 5 -.Lstrcmp_start_addr: +L(strcmp_start_addr): #if STRCMP_NO_PRECHECK == 0 -.Lfastpath_exit: +L(fastpath_exit): sub r0, r2, r3 bx lr nop #endif -def_fn __strcmp_arm +ENTRY_ALIGN (__strcmp_arm, 0) #if STRCMP_NO_PRECHECK == 0 ldrb r2, [src1] ldrb r3, [src2] cmp r2, #1 it cs cmpcs r2, r3 - bne .Lfastpath_exit + bne L(fastpath_exit) #endif - .cfi_startproc strd r4, r5, [sp, #-16]! 
.cfi_def_cfa_offset 16 .cfi_offset 4, -16 @@ -158,12 +151,12 @@ def_fn __strcmp_arm .cfi_offset 7, -4 mvn const_m1, #0 lsl r2, tmp1, #29 - cbz r2, .Lloop_aligned8 + cbz r2, L(loop_aligned8) -.Lnot_aligned: +L(not_aligned): eor tmp1, src1, src2 tst tmp1, #7 - bne .Lmisaligned8 + bne L(misaligned8) /* Deal with mutual misalignment by aligning downwards and then masking off the unwanted loaded data to prevent a difference. */ @@ -180,29 +173,29 @@ def_fn __strcmp_arm S2HI tmp1, const_m1, tmp2 orn data1a, data1a, tmp1 orn data2a, data2a, tmp1 - beq .Lstart_realigned8 + beq L(start_realigned8) orn data1b, data1b, tmp1 mov data1a, const_m1 orn data2b, data2b, tmp1 mov data2a, const_m1 - b .Lstart_realigned8 + b L(start_realigned8) /* Unwind the inner loop by a factor of 2, giving 16 bytes per pass. */ .p2align 5,,12 /* Don't start in the tail bytes of a cache line. */ .p2align 2 /* Always word aligned. */ -.Lloop_aligned8: +L(loop_aligned8): ldrd data1a, data1b, [src1], #16 ldrd data2a, data2b, [src2], #16 -.Lstart_realigned8: +L(start_realigned8): uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ eor syndrome_a, data1a, data2a sel syndrome_a, syndrome_a, const_m1 - cbnz syndrome_a, .Ldiff_in_a + cbnz syndrome_a, L(diff_in_a) uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */ eor syndrome_b, data1b, data2b sel syndrome_b, syndrome_b, const_m1 - cbnz syndrome_b, .Ldiff_in_b + cbnz syndrome_b, L(diff_in_b) ldrd data1a, data1b, [src1, #-8] ldrd data2a, data2b, [src2, #-8] @@ -214,47 +207,47 @@ def_fn __strcmp_arm sel syndrome_b, syndrome_b, const_m1 /* Can't use CBZ for backwards branch. */ orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */ - beq .Lloop_aligned8 + beq L(loop_aligned8) -.Ldiff_found: - cbnz syndrome_a, .Ldiff_in_a +L(diff_found): + cbnz syndrome_a, L(diff_in_a) -.Ldiff_in_b: +L(diff_in_b): strcmp_epilogue_aligned syndrome_b, data1b, data2b 1 -.Ldiff_in_a: +L(diff_in_a): .cfi_restore_state strcmp_epilogue_aligned syndrome_a, data1a, data2a 1 .cfi_restore_state -.Lmisaligned8: +L(misaligned8): tst tmp1, #3 - bne .Lmisaligned4 + bne L(misaligned4) ands tmp1, src1, #3 - bne .Lmutual_align4 + bne L(mutual_align4) /* Unrolled by a factor of 2, to reduce the number of post-increment operations. */ -.Lloop_aligned4: +L(loop_aligned4): ldr data1, [src1], #8 ldr data2, [src2], #8 -.Lstart_realigned4: +L(start_realigned4): uadd8 syndrome, data1, const_m1 /* Only need GE bits. */ eor syndrome, data1, data2 sel syndrome, syndrome, const_m1 - cbnz syndrome, .Laligned4_done + cbnz syndrome, L(aligned4_done) ldr data1, [src1, #-4] ldr data2, [src2, #-4] uadd8 syndrome, data1, const_m1 eor syndrome, data1, data2 sel syndrome, syndrome, const_m1 cmp syndrome, #0 - beq .Lloop_aligned4 + beq L(loop_aligned4) -.Laligned4_done: +L(aligned4_done): strcmp_epilogue_aligned syndrome, data1, data2, 0 -.Lmutual_align4: +L(mutual_align4): .cfi_restore_state /* Deal with mutual misalignment by aligning downwards and then masking off the unwanted loaded data to prevent a difference. 
*/ @@ -269,57 +262,57 @@ def_fn __strcmp_arm S2HI tmp1, const_m1, tmp1 orn data1, data1, tmp1 orn data2, data2, tmp1 - b .Lstart_realigned4 + b L(start_realigned4) -.Lmisaligned4: +L(misaligned4): ands tmp1, src1, #3 - beq .Lsrc1_aligned + beq L(src1_aligned) sub src2, src2, tmp1 bic src1, src1, #3 lsls tmp1, tmp1, #31 ldr data1, [src1], #4 - beq .Laligned_m2 - bcs .Laligned_m1 + beq L(aligned_m2) + bcs L(aligned_m1) #if STRCMP_NO_PRECHECK == 1 ldrb data2, [src2, #1] uxtb tmp1, data1, ror #BYTE1_OFFSET subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbz data2, .Lmisaligned_exit + bne L(misaligned_exit) + cbz data2, L(misaligned_exit) -.Laligned_m2: +L(aligned_m2): ldrb data2, [src2, #2] uxtb tmp1, data1, ror #BYTE2_OFFSET subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbz data2, .Lmisaligned_exit + bne L(misaligned_exit) + cbz data2, L(misaligned_exit) -.Laligned_m1: +L(aligned_m1): ldrb data2, [src2, #3] uxtb tmp1, data1, ror #BYTE3_OFFSET subs tmp1, tmp1, data2 - bne .Lmisaligned_exit + bne L(misaligned_exit) add src2, src2, #4 - cbnz data2, .Lsrc1_aligned + cbnz data2, L(src1_aligned) #else /* STRCMP_NO_PRECHECK */ /* If we've done the pre-check, then we don't need to check the first byte again here. */ ldrb data2, [src2, #2] uxtb tmp1, data1, ror #BYTE2_OFFSET subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbz data2, .Lmisaligned_exit + bne L(misaligned_exit) + cbz data2, L(misaligned_exit) -.Laligned_m2: +L(aligned_m2): ldrb data2, [src2, #3] uxtb tmp1, data1, ror #BYTE3_OFFSET subs tmp1, tmp1, data2 - bne .Lmisaligned_exit - cbnz data2, .Laligned_m1 + bne L(misaligned_exit) + cbnz data2, L(aligned_m1) #endif -.Lmisaligned_exit: +L(misaligned_exit): .cfi_remember_state mov result, tmp1 ldr r4, [sp], #16 @@ -327,10 +320,10 @@ def_fn __strcmp_arm bx lr #if STRCMP_NO_PRECHECK == 0 -.Laligned_m1: +L(aligned_m1): add src2, src2, #4 #endif -.Lsrc1_aligned: +L(src1_aligned): .cfi_restore_state /* src1 is word aligned, but src2 has no common alignment with it. */ @@ -339,11 +332,11 @@ def_fn __strcmp_arm bic src2, src2, #3 ldr data2, [src2], #4 - bhi .Loverlap1 /* C=1, Z=0 => src2[1:0] = 0b11. */ - bcs .Loverlap2 /* C=1, Z=1 => src2[1:0] = 0b10. */ + bhi L(overlap1) /* C=1, Z=0 => src2[1:0] = 0b11. */ + bcs L(overlap2) /* C=1, Z=1 => src2[1:0] = 0b10. */ /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */ -.Loverlap3: +L(overlap3): bic tmp1, data1, #MSB uadd8 syndrome, data1, const_m1 eors syndrome, tmp1, data2, S2LO #8 @@ -355,14 +348,14 @@ def_fn __strcmp_arm cmp tmp1, data2, S2HI #24 bne 6f ldr data1, [src1], #4 - b .Loverlap3 + b L(overlap3) 4: S2LO data2, data2, #8 - b .Lstrcmp_tail + b L(strcmp_tail) 5: bics syndrome, syndrome, #MSB - bne .Lstrcmp_done_equal + bne L(strcmp_done_equal) /* We can only get here if the MSB of data1 contains 0, so fast-path the exit. */ @@ -381,10 +374,10 @@ def_fn __strcmp_arm .cfi_restore_state S2LO data1, data1, #24 and data2, data2, #LSB - b .Lstrcmp_tail + b L(strcmp_tail) .p2align 5,,12 /* Ensure at least 3 instructions in cache line. 
*/ -.Loverlap2: +L(overlap2): and tmp1, data1, const_m1, S2LO #16 uadd8 syndrome, data1, const_m1 eors syndrome, tmp1, data2, S2LO #16 @@ -396,28 +389,28 @@ def_fn __strcmp_arm cmp tmp1, data2, S2HI #16 bne 6f ldr data1, [src1], #4 - b .Loverlap2 + b L(overlap2) 4: S2LO data2, data2, #16 - b .Lstrcmp_tail + b L(strcmp_tail) 5: ands syndrome, syndrome, const_m1, S2LO #16 - bne .Lstrcmp_done_equal + bne L(strcmp_done_equal) ldrh data2, [src2] S2LO data1, data1, #16 #ifdef __ARM_BIG_ENDIAN lsl data2, data2, #16 #endif - b .Lstrcmp_tail + b L(strcmp_tail) 6: S2LO data1, data1, #16 and data2, data2, const_m1, S2LO #16 - b .Lstrcmp_tail + b L(strcmp_tail) .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */ -.Loverlap1: +L(overlap1): and tmp1, data1, #LSB uadd8 syndrome, data1, const_m1 eors syndrome, tmp1, data2, S2LO #24 @@ -429,20 +422,20 @@ def_fn __strcmp_arm cmp tmp1, data2, S2HI #8 bne 6f ldr data1, [src1], #4 - b .Loverlap1 + b L(overlap1) 4: S2LO data2, data2, #24 - b .Lstrcmp_tail + b L(strcmp_tail) 5: tst syndrome, #LSB - bne .Lstrcmp_done_equal + bne L(strcmp_done_equal) ldr data2, [src2] 6: S2LO data1, data1, #8 bic data2, data2, #MSB - b .Lstrcmp_tail + b L(strcmp_tail) -.Lstrcmp_done_equal: +L(strcmp_done_equal): mov result, #0 .cfi_remember_state ldrd r4, r5, [sp], #16 @@ -453,7 +446,7 @@ def_fn __strcmp_arm .cfi_restore 7 bx lr -.Lstrcmp_tail: +L(strcmp_tail): .cfi_restore_state #ifndef __ARM_BIG_ENDIAN rev data1, data1 @@ -475,5 +468,5 @@ def_fn __strcmp_arm .cfi_restore 7 sub result, result, data2, lsr #24 bx lr - .cfi_endproc - .size __strcmp, . - .Lstrcmp_start_addr + +END (__strcmp_arm) diff --git a/string/arm/strlen-armv6t2.S b/string/arm/strlen-armv6t2.S index 279ec87..76e6930 100644 --- a/string/arm/strlen-armv6t2.S +++ b/string/arm/strlen-armv6t2.S @@ -11,13 +11,7 @@ */ - .macro def_fn f p2align=0 - .text - .p2align \p2align - .global \f - .type \f, %function -\f: - .endm +#include "../asmdefs.h" #ifdef __ARMEB__ #define S2LO lsl @@ -44,27 +38,27 @@ #define tmp1 r4 /* Overlaps const_0 */ #define tmp2 r5 -def_fn __strlen_armv6t2 p2align=6 +ENTRY (__strlen_armv6t2) pld [srcin, #0] strd r4, r5, [sp, #-8]! bic src, srcin, #7 mvn const_m1, #0 ands tmp1, srcin, #7 /* (8 - bytes) to alignment. */ pld [src, #32] - bne.w .Lmisaligned8 + bne.w L(misaligned8) mov const_0, #0 mov result, #-8 -.Lloop_aligned: +L(loop_aligned): /* Bytes 0-7. */ ldrd data1a, data1b, [src] pld [src, #64] add result, result, #8 -.Lstart_realigned: +L(start_realigned): uadd8 data1a, data1a, const_m1 /* Saturating GE<0:3> set. */ sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ uadd8 data1b, data1b, const_m1 sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ - cbnz data1b, .Lnull_found + cbnz data1b, L(null_found) /* Bytes 8-15. */ ldrd data1a, data1b, [src, #8] @@ -73,7 +67,7 @@ def_fn __strlen_armv6t2 p2align=6 sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ uadd8 data1b, data1b, const_m1 sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ - cbnz data1b, .Lnull_found + cbnz data1b, L(null_found) /* Bytes 16-23. */ ldrd data1a, data1b, [src, #16] @@ -82,7 +76,7 @@ def_fn __strlen_armv6t2 p2align=6 sel data1a, const_0, const_m1 /* Select based on GE<0:3>. */ uadd8 data1b, data1b, const_m1 sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ - cbnz data1b, .Lnull_found + cbnz data1b, L(null_found) /* Bytes 24-31. 
*/ ldrd data1a, data1b, [src, #24] @@ -93,9 +87,9 @@ def_fn __strlen_armv6t2 p2align=6 uadd8 data1b, data1b, const_m1 sel data1b, data1a, const_m1 /* Only used if d1a == 0. */ cmp data1b, #0 - beq .Lloop_aligned + beq L(loop_aligned) -.Lnull_found: +L(null_found): cmp data1a, #0 itt eq addeq result, result, #4 @@ -108,7 +102,7 @@ def_fn __strlen_armv6t2 p2align=6 add result, result, data1a, lsr #3 /* Bits -> Bytes. */ bx lr -.Lmisaligned8: +L(misaligned8): ldrd data1a, data1b, [src] and tmp2, tmp1, #3 rsb result, tmp1, #0 @@ -121,5 +115,6 @@ def_fn __strlen_armv6t2 p2align=6 ornne data1b, data1b, tmp2 movne data1a, const_m1 mov const_0, #0 - b .Lstart_realigned - .size __strlen_armv6t2, . - __strlen_armv6t2 + b L(start_realigned) + +END (__strlen_armv6t2) diff --git a/string/include/stringlib.h b/string/include/stringlib.h index 3f60220..b3b6181 100644 --- a/string/include/stringlib.h +++ b/string/include/stringlib.h @@ -13,7 +13,6 @@ #endif #if __aarch64__ -void *__memcpy_bytewise (void *__restrict, const void *__restrict, size_t); void *__memcpy_aarch64 (void *__restrict, const void *__restrict, size_t); void *__memmove_aarch64 (void *, const void *, size_t); void *__memset_aarch64 (void *, int, size_t); @@ -22,6 +21,7 @@ int __memcmp_aarch64 (const void *, const void *, size_t); char *__strcpy_aarch64 (char *__restrict, const char *__restrict); int __strcmp_aarch64 (const char *, const char *); char *__strchr_aarch64 (const char *, int); +char *__strrchr_aarch64 (const char *, int); char *__strchrnul_aarch64 (const char *, int ); size_t __strlen_aarch64 (const char *); size_t __strnlen_aarch64 (const char *, size_t); diff --git a/string/memcpy_bytewise.S b/string/memcpy_bytewise.S deleted file mode 100644 index 7ee3474..0000000 --- a/string/memcpy_bytewise.S +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Trivial AArch64 memcpy. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#if __aarch64__ -#include "asmdefs.h" - -ENTRY (__memcpy_bytewise) - cbz x2, 2f - mov x3, 0 -1: - ldrb w4, [x1, x3] - strb w4, [x0, x3] - add x3, x3, 1 - cmp x3, x2 - bne 1b -2: - ret -END (__memcpy_bytewise) -#endif diff --git a/string/strrchr.S b/string/strrchr.S index 18b1cf9..119b1d5 100644 --- a/string/strrchr.S +++ b/string/strrchr.S @@ -6,6 +6,7 @@ */ #if __aarch64__ +#include "aarch64/strrchr.S" # if __ARM_FEATURE_SVE #include "aarch64/strrchr-sve.S" # endif diff --git a/string/test/memcpy.c b/string/test/memcpy.c index e31f359..8572452 100644 --- a/string/test/memcpy.c +++ b/string/test/memcpy.c @@ -19,7 +19,6 @@ static const struct fun #define F(x) {#x, x}, F(memcpy) #if __aarch64__ -F(__memcpy_bytewise) F(__memcpy_aarch64) # if __ARM_NEON F(__memcpy_aarch64_simd) diff --git a/string/test/strrchr.c b/string/test/strrchr.c index 6277fae..532fa51 100644 --- a/string/test/strrchr.c +++ b/string/test/strrchr.c @@ -20,6 +20,7 @@ static const struct fun #define F(x) {#x, x}, F(strrchr) #if __aarch64__ +F(__strrchr_aarch64) # if __ARM_FEATURE_SVE F(__strrchr_aarch64_sve) # endif |
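
In the math/pow.c hunk, the subnormal path now multiplies opt_barrier_double (x) rather than x by 0x1p52. As the new comment notes, without the barrier some versions of clang evaluate the multiply unconditionally, outside the topx == 0 branch, and for large finite x the speculative x * 0x1p52 overflows and raises a spurious exception. The barrier's definition is not part of this diff; a minimal sketch of one portable way to write it (an assumption, not necessarily the library's actual implementation) is to pass the value through a volatile temporary so the compiler treats it as opaque:

/* Assumed sketch of an optimization barrier; the real definition may
   differ.  The volatile round trip keeps the dependent multiply from
   being evaluated outside the branch that guards it.  */
static inline double
opt_barrier_double (double x)
{
  volatile double y = x;
  return y;
}

An inline-asm form such as __asm__ ("" : "+w" (x)) on AArch64 has the same effect without spilling to the stack.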
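Most of the churn in the string/ files is mechanical: each file's private def_fn macro, .Lfoo labels and trailing .size directives are replaced by the shared ENTRY, ENTRY_ALIGN, END and L() macros pulled in through #include "../asmdefs.h". That header is not shown in this diff; the sketch below is only an inference from the boilerplate being removed (the .cfi_* lines dropped from strcmp-armv6m.S suggest the macros also emit unwind markers), not its exact contents:

/* Hypothetical string/asmdefs.h-style macros, reconstructed from the
   per-file boilerplate this patch deletes.  */
#define ENTRY_ALIGN(name, alignment) \
  .global name;                      \
  .type name, %function;             \
  .p2align alignment;                \
  name:                              \
  .cfi_startproc;

#define ENTRY(name) ENTRY_ALIGN (name, 6)

#define END(name) \
  .cfi_endproc;   \
  .size name, . - name;

#define L(l) .L ## l

Centralising the definitions keeps symbol type, alignment, size and unwind information consistent across every routine, which is why the per-file .size and .cfi_* lines disappear in the hunks above.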
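Several of the AArch64 routines touched here (strcpy, strlen, strncmp, strnlen) rely on the word-at-a-time NUL test their comments describe: for a byte X, (X - 1) & ~X & 0x80 is non-zero exactly when X is zero, and with the REP8_01/REP8_7f constants the same test runs over all eight bytes of a 64-bit word at once. A minimal C model of the check (illustrative only, not code from this patch):

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* Non-zero iff some byte of x is zero: per byte, (b - 1) & ~(b | 0x7f)
   has bit 7 set only when b == 0, and the subtraction runs across the
   whole word in parallel.  */
static inline uint64_t
has_zero_byte (uint64_t x)
{
  return (x - REP8_01) & ~(x | REP8_7f);
}

The assembly computes this with a sub/orr/bics sequence on each 64-bit word, then locates the terminating byte with clz after a rev or rbit; as the strnlen comment warns, big-endian needs extra care because the borrow from the subtraction can propagate across byte boundaries.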